LLVM 13.0.0
ARMISelLowering.cpp
Go to the documentation of this file.
1//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that ARM uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "ARMISelLowering.h"
15#include "ARMBaseInstrInfo.h"
16#include "ARMBaseRegisterInfo.h"
17#include "ARMCallingConv.h"
20#include "ARMPerfectShuffle.h"
21#include "ARMRegisterInfo.h"
22#include "ARMSelectionDAGInfo.h"
23#include "ARMSubtarget.h"
27#include "Utils/ARMBaseInfo.h"
28#include "llvm/ADT/APFloat.h"
29#include "llvm/ADT/APInt.h"
30#include "llvm/ADT/ArrayRef.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/DenseMap.h"
33#include "llvm/ADT/STLExtras.h"
36#include "llvm/ADT/Statistic.h"
38#include "llvm/ADT/StringRef.h"
40#include "llvm/ADT/Triple.h"
41#include "llvm/ADT/Twine.h"
65#include "llvm/IR/Attributes.h"
66#include "llvm/IR/CallingConv.h"
67#include "llvm/IR/Constant.h"
68#include "llvm/IR/Constants.h"
69#include "llvm/IR/DataLayout.h"
70#include "llvm/IR/DebugLoc.h"
72#include "llvm/IR/Function.h"
73#include "llvm/IR/GlobalAlias.h"
74#include "llvm/IR/GlobalValue.h"
76#include "llvm/IR/IRBuilder.h"
77#include "llvm/IR/InlineAsm.h"
78#include "llvm/IR/Instruction.h"
81#include "llvm/IR/Intrinsics.h"
82#include "llvm/IR/IntrinsicsARM.h"
83#include "llvm/IR/Module.h"
85#include "llvm/IR/Type.h"
86#include "llvm/IR/User.h"
87#include "llvm/IR/Value.h"
88#include "llvm/MC/MCInstrDesc.h"
91#include "llvm/MC/MCSchedule.h"
98#include "llvm/Support/Debug.h"
106#include <algorithm>
107#include <cassert>
108#include <cstdint>
109#include <cstdlib>
110#include <iterator>
111#include <limits>
112#include <string>
113#include <tuple>
114#include <utility>
115#include <vector>
116
117using namespace llvm;
118using namespace llvm::PatternMatch;
119
120#define DEBUG_TYPE "arm-isel"
121
// Codegen statistics, reported under -stats (see llvm/ADT/Statistic.h).
123STATISTIC(NumTailCalls, "Number of tail calls");
124STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
125STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
// NOTE(review): the STATISTIC(...) opener for this counter was lost in
// extraction; only its description string survives on the next line.
126 "Number of constants with their storage promoted into constant pools");
127
// Debugging knob: lets ARM/Thumb interworking support be switched off.
128static cl::opt<bool>
129ARMInterworking("arm-interworking", cl::Hidden,
130 cl::desc("Enable / disable ARM interworking (for debugging only)"),
131 cl::init(true));
132
// NOTE(review): the cl::opt declaration openers for the next three options
// (the constant-promotion enable flag and its per-constant / total size
// limits) are elided in this extraction; only their argument lists survive.
134 "arm-promote-constant", cl::Hidden,
135 cl::desc("Enable / disable promotion of unnamed_addr constants into "
136 "constant pools"),
137 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
139 "arm-promote-constant-max-size", cl::Hidden,
140 cl::desc("Maximum size of constant to promote into a constant pool"),
141 cl::init(64));
143 "arm-promote-constant-max-total", cl::Hidden,
144 cl::desc("Maximum size of ALL constants to promote into a constant pool"),
145 cl::init(128));
146
// Caps how wide an interleave factor MVE VLDn lowering may produce.
148MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
149 cl::desc("Maximum interleave factor for MVE VLDn to generate."),
150 cl::init(2));
151
152// The APCS parameter registers.
153static const MCPhysReg GPRArgRegs[] = {
154 ARM::R0, ARM::R1, ARM::R2, ARM::R3
155};
156
// Configure operation legality for a NEON vector type VT whose loads/stores
// are promoted to PromotedLdStVT.
//
// NOTE(review): the embedded original line numbers jump (158→161→164, …),
// so most of this function's setOperationAction calls are elided in this
// extraction; only the control-flow skeleton is visible here.
157void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
158 if (VT != PromotedLdStVT) {
161
164 }
165
166 MVT ElemTy = VT.getVectorElementType();
167 if (ElemTy != MVT::f64)
171 if (ElemTy == MVT::i32) {
176 } else {
181 }
190 if (VT.isInteger()) {
194 }
195
196 // Neon does not support vector divide/remainder operations.
205
// ABS/SMIN/SMAX/UMIN/UMAX are natively legal on integer vectors except the
// 64-bit-element ones (v1i64/v2i64 have no such NEON instructions).
206 if (!VT.isFloatingPoint() &&
207 VT != MVT::v2i64 && VT != MVT::v1i64)
208 for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
209 setOperationAction(Opcode, VT, Legal);
// Saturating add/sub are legal on all NEON integer vector types.
210 if (!VT.isFloatingPoint())
211 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
212 setOperationAction(Opcode, VT, Legal);
213}
214
215void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
216 addRegisterClass(VT, &ARM::DPRRegClass);
217 addTypeForNEON(VT, MVT::f64);
218}
219
220void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
221 addRegisterClass(VT, &ARM::DPairRegClass);
222 addTypeForNEON(VT, MVT::v2f64);
223}
224
// Mark every target-independent operation on VT as Expand, so that only the
// operations explicitly re-legalized afterwards remain supported.
225void ARMTargetLowering::setAllExpand(MVT VT) {
226 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
227 setOperationAction(Opc, VT, Expand);
228
229 // We support these really simple operations even on types where all
230 // the actual arithmetic has to be broken down into simpler
231 // operations or turned into library calls.
// NOTE(review): the setOperationAction calls that re-legalize those simple
// operations (original lines 232-235) are elided in this extraction.
236}
237
238void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
239 LegalizeAction Action) {
240 setLoadExtAction(ISD::EXTLOAD, From, To, Action);
241 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
242 setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
243}
244
// Register the MVE (M-profile Vector Extension) vector types in the MQPR /
// VCCR register classes and configure their operation legality.  HasMVEFP
// says whether the MVE.fp extension is present; without it the float vector
// types are only supported at the bitcast/load/store level (setAllExpand).
//
// NOTE(review): the embedded original line numbers jump throughout this
// function, so many statements (e.g. the IntTypes/pTypes array declarations
// and most setOperationAction calls) are elided in this extraction; the
// comments below describe sections whose bodies are missing here.
245void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
247
248 for (auto VT : IntTypes) {
249 addRegisterClass(VT, &ARM::MQPRRegClass);
275
276 // No native support for these.
286
287 // Vector reductions
297
298 if (!HasMVEFP) {
303 }
304
305 // Pre and Post inc are supported on loads and stores
306 for (unsigned im = (unsigned)ISD::PRE_INC;
312 }
313 }
314
315 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
316 for (auto VT : FloatTypes) {
317 addRegisterClass(VT, &ARM::MQPRRegClass);
318 if (!HasMVEFP)
319 setAllExpand(VT);
320
321 // These are legal or custom whether we have MVE.fp or not
334
335 // Pre and Post inc are supported on loads and stores
336 for (unsigned im = (unsigned)ISD::PRE_INC;
342 }
343
344 if (HasMVEFP) {
352
353 // No native support for these.
366 }
367 }
368
369 // Custom Expand smaller than legal vector reductions to prevent false zero
370 // items being added.
379
380 // We 'support' these types up to bitcast/load/store level, regardless of
381 // MVE integer-only / float support. Only doing FP data processing on the FP
382 // vector types is inhibited at integer-only level.
383 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
384 for (auto VT : LongTypes) {
385 addRegisterClass(VT, &ARM::MQPRRegClass);
386 setAllExpand(VT);
390 }
392
393 // We can do bitwise operations on v2i64 vectors
397
398 // It is legal to extload from v4i8 to v4i16 or v4i32.
399 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
400 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
401 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
402
403 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
409
410 // Some truncating stores are legal too.
414
415 // Pre and Post inc on these are legal, given the correct extends
416 for (unsigned im = (unsigned)ISD::PRE_INC;
418 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
423 }
424 }
425
426 // Predicate types
428 for (auto VT : pTypes) {
429 addRegisterClass(VT, &ARM::VCCRRegClass);
443 }
452}
453
455 const ARMSubtarget &STI)
456 : TargetLowering(TM), Subtarget(&STI) {
457 RegInfo = Subtarget->getRegisterInfo();
458 Itins = Subtarget->getInstrItineraryData();
459
462
463 if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
464 !Subtarget->isTargetWatchOS()) {
465 bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
466 for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
470 }
471
472 if (Subtarget->isTargetMachO()) {
473 // Uses VFP for Thumb libfuncs if available.
474 if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
475 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
476 static const struct {
477 const RTLIB::Libcall Op;
478 const char * const Name;
479 const ISD::CondCode Cond;
480 } LibraryCalls[] = {
481 // Single-precision floating-point arithmetic.
482 { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
483 { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
484 { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
485 { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
486
487 // Double-precision floating-point arithmetic.
488 { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
489 { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
490 { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
491 { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
492
493 // Single-precision comparisons.
494 { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },
495 { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },
496 { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },
497 { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },
498 { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
499 { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
500 { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },
501
502 // Double-precision comparisons.
503 { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
504 { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },
505 { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },
506 { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },
507 { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
508 { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
509 { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },
510
511 // Floating-point to integer conversions.
512 // i64 conversions are done via library routines even when generating VFP
513 // instructions, so use the same ones.
514 { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID },
515 { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
516 { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID },
517 { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
518
519 // Conversions between floating types.
520 { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID },
521 { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID },
522
523 // Integer to floating-point conversions.
524 // i64 conversions are done via library routines even when generating VFP
525 // instructions, so use the same ones.
526 // FIXME: There appears to be some naming inconsistency in ARM libgcc:
527 // e.g., __floatunsidf vs. __floatunssidfvfp.
528 { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID },
529 { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
530 { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID },
531 { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
532 };
533
534 for (const auto &LC : LibraryCalls) {
535 setLibcallName(LC.Op, LC.Name);
536 if (LC.Cond != ISD::SETCC_INVALID)
537 setCmpLibcallCC(LC.Op, LC.Cond);
538 }
539 }
540 }
541
542 // These libcalls are not available in 32-bit.
543 setLibcallName(RTLIB::SHL_I128, nullptr);
544 setLibcallName(RTLIB::SRL_I128, nullptr);
545 setLibcallName(RTLIB::SRA_I128, nullptr);
546 setLibcallName(RTLIB::MUL_I128, nullptr);
547
548 // RTLIB
549 if (Subtarget->isAAPCS_ABI() &&
550 (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
551 Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
552 static const struct {
553 const RTLIB::Libcall Op;
554 const char * const Name;
555 const CallingConv::ID CC;
556 const ISD::CondCode Cond;
557 } LibraryCalls[] = {
558 // Double-precision floating-point arithmetic helper functions
559 // RTABI chapter 4.1.2, Table 2
560 { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
561 { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
562 { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
563 { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
564
565 // Double-precision floating-point comparison helper functions
566 // RTABI chapter 4.1.2, Table 3
567 { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
568 { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
569 { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
570 { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
571 { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
572 { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
573 { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
574
575 // Single-precision floating-point arithmetic helper functions
576 // RTABI chapter 4.1.2, Table 4
577 { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
578 { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
579 { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
580 { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
581
582 // Single-precision floating-point comparison helper functions
583 // RTABI chapter 4.1.2, Table 5
584 { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
585 { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
586 { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
587 { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
588 { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
589 { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
590 { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
591
592 // Floating-point to integer conversions.
593 // RTABI chapter 4.1.2, Table 6
594 { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
595 { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
596 { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
597 { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
598 { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
599 { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
600 { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
601 { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
602
603 // Conversions between floating types.
604 // RTABI chapter 4.1.2, Table 7
605 { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
606 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
607 { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
608
609 // Integer to floating-point conversions.
610 // RTABI chapter 4.1.2, Table 8
611 { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
612 { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
613 { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
614 { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
615 { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
616 { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
617 { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
618 { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
619
620 // Long long helper functions
621 // RTABI chapter 4.2, Table 9
622 { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
623 { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
624 { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
625 { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
626
627 // Integer division functions
628 // RTABI chapter 4.3.1
629 { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
630 { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
631 { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
632 { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
633 { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
634 { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
635 { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
636 { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
637 };
638
639 for (const auto &LC : LibraryCalls) {
640 setLibcallName(LC.Op, LC.Name);
642 if (LC.Cond != ISD::SETCC_INVALID)
643 setCmpLibcallCC(LC.Op, LC.Cond);
644 }
645
646 // EABI dependent RTLIB
647 if (TM.Options.EABIVersion == EABI::EABI4 ||
648 TM.Options.EABIVersion == EABI::EABI5) {
649 static const struct {
650 const RTLIB::Libcall Op;
651 const char *const Name;
652 const CallingConv::ID CC;
653 const ISD::CondCode Cond;
654 } MemOpsLibraryCalls[] = {
655 // Memory operations
656 // RTABI chapter 4.3.4
657 { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
658 { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
659 { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
660 };
661
662 for (const auto &LC : MemOpsLibraryCalls) {
663 setLibcallName(LC.Op, LC.Name);
665 if (LC.Cond != ISD::SETCC_INVALID)
666 setCmpLibcallCC(LC.Op, LC.Cond);
667 }
668 }
669 }
670
671 if (Subtarget->isTargetWindows()) {
672 static const struct {
673 const RTLIB::Libcall Op;
674 const char * const Name;
675 const CallingConv::ID CC;
676 } LibraryCalls[] = {
677 { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
678 { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
679 { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
680 { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
681 { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
682 { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
683 { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
684 { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
685 };
686
687 for (const auto &LC : LibraryCalls) {
688 setLibcallName(LC.Op, LC.Name);
690 }
691 }
692
693 // Use divmod compiler-rt calls for iOS 5.0 and later.
694 if (Subtarget->isTargetMachO() &&
695 !(Subtarget->isTargetIOS() &&
696 Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
697 setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
698 setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
699 }
700
701 // The half <-> float conversion functions are always soft-float on
702 // non-watchos platforms, but are needed for some targets which use a
703 // hard-float calling convention by default.
704 if (!Subtarget->isTargetWatchABI()) {
705 if (Subtarget->isAAPCS_ABI()) {
706 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
707 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
708 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
709 } else {
710 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
711 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
712 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
713 }
714 }
715
716 // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
717 // a __gnu_ prefix (which is the default).
718 if (Subtarget->isTargetAEABI()) {
719 static const struct {
720 const RTLIB::Libcall Op;
721 const char * const Name;
722 const CallingConv::ID CC;
723 } LibraryCalls[] = {
724 { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
725 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
726 { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
727 };
728
729 for (const auto &LC : LibraryCalls) {
730 setLibcallName(LC.Op, LC.Name);
732 }
733 }
734
735 if (Subtarget->isThumb1Only())
736 addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
737 else
738 addRegisterClass(MVT::i32, &ARM::GPRRegClass);
739
740 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
741 Subtarget->hasFPRegs()) {
742 addRegisterClass(MVT::f32, &ARM::SPRRegClass);
743 addRegisterClass(MVT::f64, &ARM::DPRRegClass);
744 if (!Subtarget->hasVFP2Base())
745 setAllExpand(MVT::f32);
746 if (!Subtarget->hasFP64())
747 setAllExpand(MVT::f64);
748 }
749
750 if (Subtarget->hasFullFP16()) {
751 addRegisterClass(MVT::f16, &ARM::HPRRegClass);
754
757 }
758
759 if (Subtarget->hasBF16()) {
760 addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
761 setAllExpand(MVT::bf16);
762 if (!Subtarget->hasFullFP16())
764 }
765
769 addAllExtLoads(VT, InnerVT, Expand);
770 }
771
774
776 }
777
780
783
784 if (Subtarget->hasMVEIntegerOps())
785 addMVEVectorTypes(Subtarget->hasMVEFloatOps());
786
787 // Combine low-overhead loop intrinsics so that we can lower i1 types.
788 if (Subtarget->hasLOB()) {
791 }
792
793 if (Subtarget->hasNEON()) {
794 addDRTypeForNEON(MVT::v2f32);
795 addDRTypeForNEON(MVT::v8i8);
796 addDRTypeForNEON(MVT::v4i16);
797 addDRTypeForNEON(MVT::v2i32);
798 addDRTypeForNEON(MVT::v1i64);
799
800 addQRTypeForNEON(MVT::v4f32);
801 addQRTypeForNEON(MVT::v2f64);
802 addQRTypeForNEON(MVT::v16i8);
803 addQRTypeForNEON(MVT::v8i16);
804 addQRTypeForNEON(MVT::v4i32);
805 addQRTypeForNEON(MVT::v2i64);
806
807 if (Subtarget->hasFullFP16()) {
808 addQRTypeForNEON(MVT::v8f16);
809 addDRTypeForNEON(MVT::v4f16);
810 }
811
812 if (Subtarget->hasBF16()) {
813 addQRTypeForNEON(MVT::v8bf16);
814 addDRTypeForNEON(MVT::v4bf16);
815 }
816 }
817
818 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
819 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
820 // none of Neon, MVE or VFP supports any arithmetic operations on it.
824 // FIXME: Code duplication: FDIV and FREM are expanded always, see
825 // ARMTargetLowering::addTypeForNEON method for details.
828 // FIXME: Create unittest.
829 // In other words, find a way when "copysign" appears in DAG with vector
830 // operands.
832 // FIXME: Code duplication: SETCC has custom operation action, see
833 // ARMTargetLowering::addTypeForNEON method for details.
835 // FIXME: Create unittest for FNEG and for FABS.
847 // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
854 }
855
856 if (Subtarget->hasNEON()) {
857 // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
858 // supported for v4f32.
873
874 // Mark v2f32 intrinsics.
889
890 // Neon does not support some operations on v1i64 and v2i64 types.
892 // Custom handling for some quad-vector types to detect VMULL.
896 // Custom handling for some vector types to avoid expensive expansions
901 // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
902 // a destination type that is wider than the source, and nor does
903 // it have a FP_TO_[SU]INT instruction with a narrower destination than
904 // source.
913
916
917 // NEON does not have single instruction CTPOP for vectors with element
918 // types wider than 8-bits. However, custom lowering can leverage the
919 // v8i8/v16i8 vcnt instruction.
926
929
930 // NEON does not have single instruction CTTZ for vectors.
935
940
945
950
954 }
955
956 // NEON only has FMA instructions as of VFP4.
957 if (!Subtarget->hasVFP4Base()) {
960 }
961
969
970 // It is legal to extload from v4i8 to v4i16 or v4i32.
972 MVT::v2i32}) {
977 }
978 }
979 }
980
981 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
997 }
998 if (Subtarget->hasMVEIntegerOps()) {
1006 }
1007
1008 if (!Subtarget->hasFP64()) {
1009 // When targeting a floating-point unit with only single-precision
1010 // operations, f64 is legal for the few double-precision instructions which
1011 // are present. However, no double-precision operations other than moves,
1012 // loads and stores are provided by the hardware.
1049 }
1050
1051 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
1054 if (Subtarget->hasFullFP16()) {
1057 }
1058 }
1059
1060 if (!Subtarget->hasFP16()) {
1063 }
1064
1066
1067 // ARM does not have floating-point extending loads.
1068 for (MVT VT : MVT::fp_valuetypes()) {
1071 }
1072
1073 // ... or truncating stores
1077
1078 // ARM does not have i1 sign extending load.
1079 for (MVT VT : MVT::integer_valuetypes())
1081
1082 // ARM supports all 4 flavors of integer indexed load / store.
1083 if (!Subtarget->isThumb1Only()) {
1084 for (unsigned im = (unsigned)ISD::PRE_INC;
1094 }
1095 } else {
1096 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
1099 }
1100
1105
1108 if (Subtarget->hasDSP()) {
1117 }
1118 if (Subtarget->hasBaseDSP()) {
1121 }
1122
1123 // i64 operation support.
1126 if (Subtarget->isThumb1Only()) {
1129 }
1130 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
1131 || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
1133
1143
1144 // MVE lowers 64 bit shifts to lsll and lsrl
1145 // assuming that ISD::SRL and SRA of i64 are already marked custom
1146 if (Subtarget->hasMVEIntegerOps())
1148
1149 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
1150 if (Subtarget->isThumb1Only()) {
1154 }
1155
1156 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
1158
1159 // ARM does not have ROTL.
1164 }
1167 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1170 }
1171
1172 // @llvm.readcyclecounter requires the Performance Monitors extension.
1173 // Default to the 0 expansion on unsupported platforms.
1174 // FIXME: Technically there are older ARM CPUs that have
1175 // implementation-specific ways of obtaining this information.
1176 if (Subtarget->hasPerfMon())
1178
1179 // Only ARMv6 has BSWAP.
1180 if (!Subtarget->hasV6Ops())
1182
1183 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1184 : Subtarget->hasDivideInARMMode();
1185 if (!hasDivide) {
1186 // These are expanded into libcalls if the cpu doesn't have HW divider.
1189 }
1190
1191 if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
1194
1197 }
1198
1201
1202 // Register based DivRem for AEABI (RTABI 4.2)
1203 if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
1204 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
1205 Subtarget->isTargetWindows()) {
1208 HasStandaloneRem = false;
1209
1210 if (Subtarget->isTargetWindows()) {
1211 const struct {
1212 const RTLIB::Libcall Op;
1213 const char * const Name;
1214 const CallingConv::ID CC;
1215 } LibraryCalls[] = {
1216 { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
1217 { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
1218 { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
1219 { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },
1220
1221 { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
1222 { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
1223 { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
1224 { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
1225 };
1226
1227 for (const auto &LC : LibraryCalls) {
1228 setLibcallName(LC.Op, LC.Name);
1229 setLibcallCallingConv(LC.Op, LC.CC);
1230 }
1231 } else {
1232 const struct {
1233 const RTLIB::Libcall Op;
1234 const char * const Name;
1235 const CallingConv::ID CC;
1236 } LibraryCalls[] = {
1237 { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1238 { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1239 { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1240 { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },
1241
1242 { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1243 { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1244 { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1245 { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
1246 };
1247
1248 for (const auto &LC : LibraryCalls) {
1249 setLibcallName(LC.Op, LC.Name);
1250 setLibcallCallingConv(LC.Op, LC.CC);
1251 }
1252 }
1253
1258 } else {
1261 }
1262
1263 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
1264 // MSVCRT doesn't have powi; fall back to pow
1265 setLibcallName(RTLIB::POWI_F32, nullptr);
1266 setLibcallName(RTLIB::POWI_F64, nullptr);
1267 }
1268
1273
1276
1277 // Use the default implementation.
1284
1285 if (Subtarget->isTargetWindows())
1287 else
1289
1290 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1291 // the default expansion.
1292 InsertFencesForAtomic = false;
1293 if (Subtarget->hasAnyDataBarrier() &&
1294 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1295 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1296 // to ldrex/strex loops already.
1298 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1300
1301 // On v8, we have particularly efficient implementations of atomic fences
1302 // if they can be combined with nearby atomic loads and stores.
1303 if (!Subtarget->hasAcquireRelease() ||
1304 getTargetMachine().getOptLevel() == 0) {
1305 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1306 InsertFencesForAtomic = true;
1307 }
1308 } else {
1309 // If there's anything we can use as a barrier, go through custom lowering
1310 // for ATOMIC_FENCE.
1311 // If target has DMB in thumb, Fences can be inserted.
1312 if (Subtarget->hasDataBarrier())
1313 InsertFencesForAtomic = true;
1314
1316 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1317
1318 // Set them all for expansion, which will force libcalls.
1331 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1332 // Unordered/Monotonic case.
1333 if (!InsertFencesForAtomic) {
1336 }
1337 }
1338
1340
1341 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1342 if (!Subtarget->hasV6Ops()) {
1345 }
1347
1348 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1349 !Subtarget->isThumb1Only()) {
1350 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
1351 // iff target supports vfp2.
1355 }
1356
1357 // We want to custom lower some of our intrinsics.
1362 if (Subtarget->useSjLjEH())
1363 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
1364
1374 if (Subtarget->hasFullFP16()) {
1378 }
1379
1381
1384 if (Subtarget->hasFullFP16())
1389
1390 // We don't support sin/cos/fmod/copysign/pow
1399 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1400 !Subtarget->isThumb1Only()) {
1403 }
1406
1407 if (!Subtarget->hasVFP4Base()) {
1410 }
1411
1412 // Various VFP goodness
1413 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1414 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1415 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1418 }
1419
1420 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1421 if (!Subtarget->hasFP16()) {
1424 }
1425
1426 // Strict floating-point comparisons need custom lowering.
1433 }
1434
1435 // Use __sincos_stret if available.
1436 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1437 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1440 }
1441
1442 // FP-ARMv8 implements a lot of rounding-like FP operations.
1443 if (Subtarget->hasFPARMv8Base()) {
1452 if (Subtarget->hasNEON()) {
1457 }
1458
1459 if (Subtarget->hasFP64()) {
1468 }
1469 }
1470
1471 // FP16 often need to be promoted to call lib functions
1472 if (Subtarget->hasFullFP16()) {
1485
1487 }
1488
1489 if (Subtarget->hasNEON()) {
1490 // vmin and vmax aren't available in a scalar form, so we can use
1491 // a NEON instruction with an undef lane instead. This has a performance
1492 // penalty on some cores, so we don't do this unless we have been
1493 // asked to by the core tuning model.
1494 if (Subtarget->useNEONForSinglePrecisionFP()) {
1499 }
1504
1505 if (Subtarget->hasFullFP16()) {
1510
1515 }
1516 }
1517
1518 // We have target-specific dag combine patterns for the following nodes:
1519 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1526
1527 if (Subtarget->hasMVEIntegerOps())
1529
1530 if (Subtarget->hasV6Ops())
1532 if (Subtarget->isThumb1Only())
1534
1536
1537 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1538 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1540 else
1542
1543 //// temporary - rewrite interface to use type
1546 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1548 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1550
1551 // On ARM arguments smaller than 4 bytes are extended, so all arguments
1552 // are at least 4 bytes aligned.
1554
1555 // Prefer likely predicted branches to selects on out-of-order cores.
1556 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1557
1558 setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment()));
1559
1560 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1561
1562 if (Subtarget->isThumb() || Subtarget->isThumb2())
1564}
1565
  // Forward the subtarget's soft-float configuration; when true, FP values
  // are passed/returned in integer registers and lowered via libcalls.
  return Subtarget->useSoftFloat();
}
1569
1570// FIXME: It might make sense to define the representative register class as the
1571// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1572// a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
1573// SPR's representative would be DPR_VFP2. This should work well if register
1574// pressure tracking were modified such that a register use would increment the
1575// pressure of the register class's representative and all of it's super
1576// classes' representatives transitively. We have not implemented this because
1577// of the difficulty prior to coalescing of modeling operand register classes
1578// due to the common occurrence of cross class copies and subregister insertions
1579// and extractions.
// Returns the representative register class (and a relative register-pressure
// cost) that pressure tracking should use for values of type VT.
std::pair<const TargetRegisterClass *, uint8_t>
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  // Relative pressure cost; scaled up below for types spanning several D regs.
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
  // Use DPR as representative register class for all floating point
  // and vector types. Since there are 32 SPR registers and 32 DPR registers so
  // the cost is 1 for both f32 and f64.
  case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
  case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
    RRC = &ARM::DPRRegClass;
    // When NEON is used for SP, only half of the register file is available
    // because operations that define both SP and DP results will be constrained
    // to the VFP2 class (D0-D15). We currently model this constraint prior to
    // coalescing by double-counting the SP regs. See the FIXME above.
    if (Subtarget->useNEONForSinglePrecisionFP())
      Cost = 2;
    break;
  // 128-bit types occupy a Q register (two D registers): cost 2.
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
    RRC = &ARM::DPRRegClass;
    Cost = 2;
    break;
  case MVT::v4i64:
    // Spans a QQ tuple (four D registers).
    RRC = &ARM::DPRRegClass;
    Cost = 4;
    break;
  case MVT::v8i64:
    // Spans a QQQQ tuple (eight D registers).
    RRC = &ARM::DPRRegClass;
    Cost = 8;
    break;
  }
  return std::make_pair(RRC, Cost);
}
1617
/// Return the human-readable name of the given target-specific (ARMISD)
/// opcode, or nullptr if the opcode has no name.
const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
// Each MAKE_CASE(V) expands to a switch case returning the stringified
// enumerator name.
#define MAKE_CASE(V)                                                           \
  case V:                                                                      \
    return #V;
  switch ((ARMISD::NodeType)Opcode) {
    break;
#undef MAKE_CASE
  }
  // Not a recognized ARMISD opcode.
  return nullptr;
}
1831
                                          EVT VT) const {
  // Scalar setcc results use the pointer-sized integer type.
  if (!VT.isVector())
    return getPointerTy(DL);

  // MVE has a predicate register.
  if ((Subtarget->hasMVEIntegerOps() &&
       (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) ||
      (Subtarget->hasMVEFloatOps() && (VT == MVT::v4f32 || VT == MVT::v8f16)))
}
1844
/// getRegClassFor - Return the register class that should be used for the
/// specified value type.
const TargetRegisterClass *
ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
  // Divergence is irrelevant on ARM; the parameter exists for API parity.
  (void)isDivergent;
  // Map v4i64 to QQ registers but do not make the type legal. Similarly map
  // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
  // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
  // MVE Q registers.
  if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
    if (VT == MVT::v4i64)
      return &ARM::QQPRRegClass;
    if (VT == MVT::v8i64)
      return &ARM::QQQQPRRegClass;
  }
}
1862
// memcpy, and other memory intrinsics, typically tries to use LDM/STM if the
// source/dest is aligned and the copy size is large enough. We therefore want
// to align such objects passed to memory intrinsics.
                                               unsigned &PrefAlign) const {
  // Only memory intrinsics (memcpy/memmove/memset) benefit from extra
  // alignment of their pointer arguments.
  if (!isa<MemIntrinsic>(CI))
    return false;
  // Only objects of at least 8 bytes are worth aligning up.
  MinSize = 8;
  // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
  // cycle faster than 4-byte aligned LDM.
  PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
  return true;
}
1876
1877// Create a fast isel object.
1878FastISel *
1883
  // Default to register-pressure scheduling; prefer ILP only when the node
  // produces FP/vector results or is a known long-latency machine op.
  unsigned NumVals = N->getNumValues();
  if (!NumVals)
    return Sched::RegPressure;

  for (unsigned i = 0; i != NumVals; ++i) {
    EVT VT = N->getValueType(i);
    // Glue and chain results carry no data; ignore them.
    if (VT == MVT::Glue || VT == MVT::Other)
      continue;
    if (VT.isFloatingPoint() || VT.isVector())
      return Sched::ILP;
  }

  if (!N->isMachineOpcode())
    return Sched::RegPressure;

  // Loads are scheduled for latency even if the instruction itinerary
  // is not available.
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());

  if (MCID.getNumDefs() == 0)
    return Sched::RegPressure;
  // A first-def operand cycle greater than 2 indicates a long-latency result,
  // so schedule for ILP to hide it.
  if (!Itins->isEmpty() &&
      Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
    return Sched::ILP;

  return Sched::RegPressure;
}
1913
1914//===----------------------------------------------------------------------===//
1915// Lowering Code
1916//===----------------------------------------------------------------------===//
1917
1918static bool isSRL16(const SDValue &Op) {
1919 if (Op.getOpcode() != ISD::SRL)
1920 return false;
1921 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1922 return Const->getZExtValue() == 16;
1923 return false;
1924}
1925
1926static bool isSRA16(const SDValue &Op) {
1927 if (Op.getOpcode() != ISD::SRA)
1928 return false;
1929 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1930 return Const->getZExtValue() == 16;
1931 return false;
1932}
1933
1934static bool isSHL16(const SDValue &Op) {
1935 if (Op.getOpcode() != ISD::SHL)
1936 return false;
1937 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1938 return Const->getZExtValue() == 16;
1939 return false;
1940}
1941
1942// Check for a signed 16-bit value. We special case SRA because it makes it
1943// more simple when also looking for SRAs that aren't sign extending a
1944// smaller value. Without the check, we'd need to take extra care with
1945// checking order for some operations.
1946static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
1947 if (isSRA16(Op))
1948 return isSHL16(Op.getOperand(0));
1949 return DAG.ComputeNumSignBits(Op) == 17;
1950}
1951
/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
  // Unsigned comparisons map onto the carry/zero-flag codes (HI/HS/LO/LS),
  // signed ones onto the overflow-aware codes (GT/GE/LT/LE).
  switch (CC) {
  default: llvm_unreachable("Unknown condition code!");
  case ISD::SETNE:  return ARMCC::NE;
  case ISD::SETEQ:  return ARMCC::EQ;
  case ISD::SETGT:  return ARMCC::GT;
  case ISD::SETGE:  return ARMCC::GE;
  case ISD::SETLT:  return ARMCC::LT;
  case ISD::SETLE:  return ARMCC::LE;
  case ISD::SETUGT: return ARMCC::HI;
  case ISD::SETUGE: return ARMCC::HS;
  case ISD::SETULT: return ARMCC::LO;
  case ISD::SETULE: return ARMCC::LS;
  }
}
1968
/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
  // Some IEEE conditions (SETONE, SETUEQ) cannot be expressed as a single ARM
  // condition; for those a second code is emitted via CondCode2.
  switch (CC) {
  default: llvm_unreachable("Unknown FP condition!");
  case ISD::SETEQ:
  case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
  case ISD::SETGT:
  case ISD::SETOGT: CondCode = ARMCC::GT; break;
  case ISD::SETGE:
  case ISD::SETOGE: CondCode = ARMCC::GE; break;
  case ISD::SETOLT: CondCode = ARMCC::MI; break;
  case ISD::SETOLE: CondCode = ARMCC::LS; break;
  case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
  case ISD::SETO:   CondCode = ARMCC::VC; break;
  case ISD::SETUO:  CondCode = ARMCC::VS; break;
  case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
  case ISD::SETUGT: CondCode = ARMCC::HI; break;
  case ISD::SETUGE: CondCode = ARMCC::PL; break;
  case ISD::SETLT:
  case ISD::SETULT: CondCode = ARMCC::LT; break;
  case ISD::SETLE:
  case ISD::SETULE: CondCode = ARMCC::LE; break;
  case ISD::SETNE:
  case ISD::SETUNE: CondCode = ARMCC::NE; break;
  }
}
1997
1998//===----------------------------------------------------------------------===//
1999// Calling Convention Implementation
2000//===----------------------------------------------------------------------===//
2001
/// getEffectiveCallingConv - Get the effective calling convention, taking into
/// account presence of floating point hardware and calling convention
/// limitations, such as support for variadic functions.
ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
                                           bool isVarArg) const {
  switch (CC) {
  default:
    report_fatal_error("Unsupported calling convention");
  // GHC has its own fixed register assignment; pass it through unchanged.
  case CallingConv::GHC:
    return CC;
  // C-like conventions resolve to APCS or an AAPCS variant depending on the
  // target ABI, hardware FP availability, and variadic-ness (variadic calls
  // cannot use the hard-float VFP convention).
  case CallingConv::Swift:
  case CallingConv::C:
  case CallingConv::Tail:
    if (!Subtarget->isAAPCS_ABI())
      return CallingConv::ARM_APCS;
    else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
             getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
             !isVarArg)
    else
  case CallingConv::Fast:
    if (!Subtarget->isAAPCS_ABI()) {
      if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
        return CallingConv::Fast;
      return CallingConv::ARM_APCS;
    } else if (Subtarget->hasVFP2Base() &&
               !Subtarget->isThumb1Only() && !isVarArg)
    else
  }
}
2045
                                                 bool isVarArg) const {
  // Argument-passing variant (Return = false) of CCAssignFnForNode.
  return CCAssignFnForNode(CC, false, isVarArg);
}
2050
                                                   bool isVarArg) const {
  // Return-value variant (Return = true) of CCAssignFnForNode.
  return CCAssignFnForNode(CC, true, isVarArg);
}
2055
/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
/// CallingConvention.
CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
                                                 bool Return,
                                                 bool isVarArg) const {
  // Dispatch on the *effective* convention, which already folds in ABI,
  // FP hardware, and variadic-ness (see getEffectiveCallingConv).
  switch (getEffectiveCallingConv(CC, isVarArg)) {
  default:
    report_fatal_error("Unsupported calling convention");
    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
    return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
  case CallingConv::Fast:
    return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
  case CallingConv::GHC:
    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
    return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
  }
}
2080
2081SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
2082 MVT LocVT, MVT ValVT, SDValue Val) const {
2083 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
2084 Val);
2085 if (Subtarget->hasFullFP16()) {
2086 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
2087 } else {
2088 Val = DAG.getNode(ISD::TRUNCATE, dl,
2089 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2090 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
2091 }
2092 return Val;
2093}
2094
2095SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
2096 MVT LocVT, MVT ValVT,
2097 SDValue Val) const {
2098 if (Subtarget->hasFullFP16()) {
2099 Val = DAG.getNode(ARMISD::VMOVrh, dl,
2100 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2101 } else {
2102 Val = DAG.getNode(ISD::BITCAST, dl,
2103 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2104 Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
2105 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2106 }
2107 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
2108}
2109
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
SDValue ARMTargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SDValue ThisVal) const {
  // Assign locations to each value returned by this call.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {

    // Pass 'this' value directly from the argument to return value, to avoid
    // reg unit interference
    if (i == 0 && isThisReturn) {
      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
             "unexpected return calling convention register assignment");
      InVals.push_back(ThisVal);
      continue;
    }

    SDValue Val;
    if (VA.needsCustom() &&
        (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
      // Handle f64 or half of a v2f64: each f64 arrives split across a pair
      // of i32 GPRs and is reassembled with VMOVDRR. The glue (InFlag)
      // threading keeps the register copies in order.
      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InFlag);
      Chain = Lo.getValue(1);
      InFlag = Lo.getValue(2);
      VA = RVLocs[++i]; // skip ahead to next loc
      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InFlag);
      Chain = Hi.getValue(1);
      InFlag = Hi.getValue(2);
      // Big-endian targets receive the halves in the opposite order.
      if (!Subtarget->isLittle())
        std::swap (Lo, Hi);
      Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);

      if (VA.getLocVT() == MVT::v2f64) {
        // v2f64 occupies four GPR locations: insert this f64 as lane 0, then
        // read two more registers for the second f64 and insert it as lane 1.
        SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
        Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                          DAG.getConstant(0, dl, MVT::i32));

        VA = RVLocs[++i]; // skip ahead to next loc
        Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
        Chain = Lo.getValue(1);
        InFlag = Lo.getValue(2);
        VA = RVLocs[++i]; // skip ahead to next loc
        Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
        Chain = Hi.getValue(1);
        InFlag = Hi.getValue(2);
        if (!Subtarget->isLittle())
          std::swap (Lo, Hi);
        Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                          DAG.getConstant(1, dl, MVT::i32));
      }
    } else {
      // Common case: the result fits in a single location register.
      Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
                               InFlag);
      Chain = Val.getValue(1);
      InFlag = Val.getValue(2);
    }

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
      break;
    }

    // f16 arguments have their size extended to 4 bytes and passed as if they
    // had been copied to the LSBs of a 32-bit register.
    // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
    if (VA.needsCustom() &&
        (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
      Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);

    InVals.push_back(Val);
  }

  return Chain;
}
2199
/// Compute the address (and MachinePointerInfo) at which an outgoing
/// stack-passed call argument described by \p VA must be stored. For tail
/// calls the slot is a fixed frame object offset by \p SPDiff; otherwise it
/// is addressed relative to the current stack pointer.
std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
    const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
    bool IsTailCall, int SPDiff) const {
  int32_t Offset = VA.getLocMemOffset();

  if (IsTailCall) {
    // Adjust into the callee's argument area, which may differ from ours by
    // SPDiff bytes, and materialize it as a fixed frame index so stores are
    // ordered correctly relative to incoming-argument loads.
    Offset += SPDiff;
    auto PtrVT = getPointerTy(DAG.getDataLayout());
    int Size = VA.getLocVT().getFixedSizeInBits() / 8;
    int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
    DstAddr = DAG.getFrameIndex(FI, PtrVT);
    DstInfo =
  } else {
                          StackPtr, PtrOff);
    DstInfo =
  }

  return std::make_pair(DstAddr, DstInfo);
}
2226
/// Pass an f64 call argument that has been split into two i32 halves. The
/// first half always goes in VA's register; the second goes in NextVA's
/// register if available, otherwise it is stored to the stack.
void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
                                         SDValue Chain, SDValue &Arg,
                                         RegsToPassVector &RegsToPass,
                                         SDValue &StackPtr,
                                         bool IsTailCall,
                                         int SPDiff) const {
  // 'id' selects which VMOVRRD result is the first (register-bound) half:
  // result 0 on little-endian, result 1 on big-endian.
  unsigned id = Subtarget->isLittle() ? 0 : 1;
  RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));

  if (NextVA.isRegLoc())
    RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
  else {
    assert(NextVA.isMemLoc());
    // Lazily materialize the stack pointer only when a stack store is needed.
    if (!StackPtr.getNode())
      StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,

    std::tie(DstAddr, DstInfo) =
        computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
    // Store the remaining half to its stack slot.
    MemOpChains.push_back(
        DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
  }
}
2256
2261
2262/// LowerCall - Lowering a call into a callseq_start <-
2263/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
2264/// nodes.
2265SDValue
2266ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2267 SmallVectorImpl<SDValue> &InVals) const {
2268 SelectionDAG &DAG = CLI.DAG;
2269 SDLoc &dl = CLI.DL;
2271 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2273 SDValue Chain = CLI.Chain;
2274 SDValue Callee = CLI.Callee;
2275 bool &isTailCall = CLI.IsTailCall;
2276 CallingConv::ID CallConv = CLI.CallConv;
2277 bool doesNotRet = CLI.DoesNotReturn;
2278 bool isVarArg = CLI.IsVarArg;
2279
2283 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2284 bool isThisReturn = false;
2285 bool isCmseNSCall = false;
2286 bool isSibCall = false;
2287 bool PreferIndirect = false;
2288
2289 // Determine whether this is a non-secure function call.
2290 if (CLI.CB && CLI.CB->getAttributes().hasFnAttribute("cmse_nonsecure_call"))
2291 isCmseNSCall = true;
2292
2293 // Disable tail calls if they're not supported.
2294 if (!Subtarget->supportsTailCall())
2295 isTailCall = false;
2296
2297 // For both the non-secure calls and the returns from a CMSE entry function,
  // the function needs to do some extra work after the call, or before the
  // return, respectively, thus it cannot end with a tail call
2300 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2301 isTailCall = false;
2302
2304 // If we're optimizing for minimum size and the function is called three or
2305 // more times in this block, we can improve codesize by calling indirectly
2306 // as BLXr has a 16-bit encoding.
2307 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2308 if (CLI.CB) {
2309 auto *BB = CLI.CB->getParent();
2310 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2311 count_if(GV->users(), [&BB](const User *U) {
2312 return isa<Instruction>(U) &&
2313 cast<Instruction>(U)->getParent() == BB;
2314 }) > 2;
2315 }
2316 }
2317 if (isTailCall) {
2318 // Check if it's really possible to do a tail call.
2319 isTailCall = IsEligibleForTailCallOptimization(
2320 Callee, CallConv, isVarArg, isStructRet,
2321 MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
2323
2324 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2325 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2326 isSibCall = true;
2327
2328 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2329 // detected sibcalls.
2330 if (isTailCall)
2331 ++NumTailCalls;
2332 }
2333
2334 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2335 report_fatal_error("failed to perform tail call elimination on a call "
2336 "site marked musttail");
2337 // Analyze operands of the call, assigning locations to each operand.
2339 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2340 *DAG.getContext());
2341 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2342
2343 // Get a count of how many bytes are to be pushed on the stack.
2344 unsigned NumBytes = CCInfo.getNextStackOffset();
2345
2346 // SPDiff is the byte offset of the call's argument area from the callee's.
2347 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2348 // by this amount for a tail call. In a sibling call it must be 0 because the
2349 // caller will deallocate the entire stack and the callee still expects its
2350 // arguments to begin at SP+0. Completely unused for non-tail calls.
2351 int SPDiff = 0;
2352
2353 if (isTailCall && !isSibCall) {
2354 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2355 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2356
2357 // Since callee will pop argument stack as a tail call, we must keep the
2358 // popped size 16-byte aligned.
2360 NumBytes = alignTo(NumBytes, StackAlign);
2361
2362 // SPDiff will be negative if this tail call requires more space than we
2363 // would automatically have in our incoming argument space. Positive if we
2364 // can actually shrink the stack.
2365 SPDiff = NumReusableBytes - NumBytes;
2366
2367 // If this call requires more stack than we have available from
2368 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2369 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2371 }
2372
2373 if (isSibCall) {
2374 // For sibling tail calls, memory operands are available in our caller's stack.
2375 NumBytes = 0;
2376 } else {
2377 // Adjust the stack pointer for the new arguments...
2378 // These operations are automatically eliminated by the prolog/epilog pass
2379 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
2380 }
2381
2383 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2384
2385 RegsToPassVector RegsToPass;
2387
2388 // During a tail call, stores to the argument area must happen after all of
2389 // the function's incoming arguments have been loaded because they may alias.
2390 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2391 // there's no point in doing so repeatedly so this tracks whether that's
2392 // happened yet.
2393 bool AfterFormalArgLoads = false;
2394
2395 // Walk the register/memloc assignments, inserting copies/loads. In the case
2396 // of tail call optimization, arguments are handled later.
2397 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2398 i != e;
2399 ++i, ++realArgIdx) {
2400 CCValAssign &VA = ArgLocs[i];
2401 SDValue Arg = OutVals[realArgIdx];
2402 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2403 bool isByVal = Flags.isByVal();
2404
2405 // Promote the value if needed.
2406 switch (VA.getLocInfo()) {
2407 default: llvm_unreachable("Unknown loc info!");
2408 case CCValAssign::Full: break;
2409 case CCValAssign::SExt:
2410 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2411 break;
2412 case CCValAssign::ZExt:
2413 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2414 break;
2415 case CCValAssign::AExt:
2416 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2417 break;
2418 case CCValAssign::BCvt:
2419 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2420 break;
2421 }
2422
2423 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2424 Chain = DAG.getStackArgumentTokenFactor(Chain);
2425 AfterFormalArgLoads = true;
2426 }
2427
2428 // f16 arguments have their size extended to 4 bytes and passed as if they
2429 // had been copied to the LSBs of a 32-bit register.
2430 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2431 if (VA.needsCustom() &&
2432 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2433 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2434 } else {
2435 // f16 arguments could have been extended prior to argument lowering.
2436 // Mask them arguments if this is a CMSE nonsecure call.
2437 auto ArgVT = Outs[realArgIdx].ArgVT;
2438 if (isCmseNSCall && (ArgVT == MVT::f16)) {
2439 auto LocBits = VA.getLocVT().getSizeInBits();
2440 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2441 SDValue Mask =
2444 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2445 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2446 }
2447 }
2448
2449 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2450 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2452 DAG.getConstant(0, dl, MVT::i32));
2454 DAG.getConstant(1, dl, MVT::i32));
2455
2456 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2457 StackPtr, MemOpChains, isTailCall, SPDiff);
2458
2459 VA = ArgLocs[++i]; // skip ahead to next loc
2460 if (VA.isRegLoc()) {
2461 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2462 StackPtr, MemOpChains, isTailCall, SPDiff);
2463 } else {
2464 assert(VA.isMemLoc());
2467 std::tie(DstAddr, DstInfo) =
2468 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2469 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2470 }
2471 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2472 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2473 StackPtr, MemOpChains, isTailCall, SPDiff);
2474 } else if (VA.isRegLoc()) {
2475 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2476 Outs[0].VT == MVT::i32) {
2477 assert(VA.getLocVT() == MVT::i32 &&
2478 "unexpected calling convention register assignment");
2479 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2480 "unexpected use of 'returned'");
2481 isThisReturn = true;
2482 }
2483 const TargetOptions &Options = DAG.getTarget().Options;
2484 if (Options.EmitCallSiteInfo)
2485 CSInfo.emplace_back(VA.getLocReg(), i);
2486 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2487 } else if (isByVal) {
2488 assert(VA.isMemLoc());
2489 unsigned offset = 0;
2490
2491 // True if this byval aggregate will be split between registers
2492 // and memory.
2493 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2494 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2495
2497
2498 unsigned RegBegin, RegEnd;
2499 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2500
2501 EVT PtrVT =
2503 unsigned int i, j;
2504 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2505 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2506 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
2507 SDValue Load =
2508 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2509 DAG.InferPtrAlign(AddArg));
2510 MemOpChains.push_back(Load.getValue(1));
2511 RegsToPass.push_back(std::make_pair(j, Load));
2512 }
2513
2514 // If parameter size outsides register area, "offset" value
2515 // helps us to calculate stack slot for remained part properly.
2516 offset = RegEnd - RegBegin;
2517
2518 CCInfo.nextInRegsParam();
2519 }
2520
2521 if (Flags.getByValSize() > 4*offset) {
2522 auto PtrVT = getPointerTy(DAG.getDataLayout());
2523 SDValue Dst;
2525 std::tie(Dst, DstInfo) =
2526 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2527 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2528 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
2529 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2530 MVT::i32);
2532 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2533
2535 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2536 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2537 Ops));
2538 }
2539 } else {
2540 assert(VA.isMemLoc());
2543 std::tie(DstAddr, DstInfo) =
2544 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2545
2546 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2547 MemOpChains.push_back(Store);
2548 }
2549 }
2550
2551 if (!MemOpChains.empty())
2553
2554 // Build a sequence of copy-to-reg nodes chained together with token chain
2555 // and flag operands which copy the outgoing args into the appropriate regs.
2557 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2558 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2559 RegsToPass[i].second, InFlag);
2560 InFlag = Chain.getValue(1);
2561 }
2562
2563 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2564 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2565 // node so that legalize doesn't hack it.
2566 bool isDirect = false;
2567
2569 const Module *Mod = MF.getFunction().getParent();
2570 const GlobalValue *GV = nullptr;
2572 GV = G->getGlobal();
2573 bool isStub =
2574 !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO();
2575
2576 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2577 bool isLocalARMFunc = false;
2578 auto PtrVt = getPointerTy(DAG.getDataLayout());
2579
2580 if (Subtarget->genLongCalls()) {
2581 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2582 "long-calls codegen is not position independent!");
2583 // Handle a global address or an external symbol. If it's not one of
2584 // those, the target's already in a register, so we don't need to do
2585 // anything extra.
2587 // Create a constant pool entry for the callee address
2588 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2590 ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
2591
2592 // Get the address of the callee into a register
2595 Callee = DAG.getLoad(
2596 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2599 const char *Sym = S->getSymbol();
2600
2601 // Create a constant pool entry for the callee address
2602 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2605 ARMPCLabelIndex, 0);
2606 // Get the address of the callee into a register
2609 Callee = DAG.getLoad(
2610 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2612 }
2613 } else if (isa<GlobalAddressSDNode>(Callee)) {
2614 if (!PreferIndirect) {
2615 isDirect = true;
2616 bool isDef = GV->isStrongDefinitionForLinker();
2617
2618 // ARM call to a local ARM function is predicable.
2619 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2620 // tBX takes a register source operand.
2621 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2622 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2623 Callee = DAG.getNode(
2626 Callee = DAG.getLoad(
2627 PtrVt, dl, DAG.getEntryNode(), Callee,
2631 } else if (Subtarget->isTargetCOFF()) {
2632 assert(Subtarget->isTargetWindows() &&
2633 "Windows is the only supported COFF target");
2634 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2635 if (GV->hasDLLImportStorageClass())
2636 TargetFlags = ARMII::MO_DLLIMPORT;
2637 else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
2638 TargetFlags = ARMII::MO_COFFSTUB;
2639 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0,
2640 TargetFlags);
2641 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2642 Callee =
2643 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2646 } else {
2647 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0);
2648 }
2649 }
2651 isDirect = true;
2652 // tBX takes a register source operand.
2653 const char *Sym = S->getSymbol();
2654 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2655 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2658 ARMPCLabelIndex, 4);
2661 Callee = DAG.getLoad(
2662 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2664 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2666 } else {
2667 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2668 }
2669 }
2670
2671 if (isCmseNSCall) {
2672 assert(!isARMFunc && !isDirect &&
2673 "Cannot handle call to ARM function or direct call");
2674 if (NumBytes > 0) {
2676 "call to non-secure function would "
2677 "require passing arguments on stack",
2678 dl.getDebugLoc());
2679 DAG.getContext()->diagnose(Diag);
2680 }
2681 if (isStructRet) {
2684 "call to non-secure function would return value through pointer",
2685 dl.getDebugLoc());
2686 DAG.getContext()->diagnose(Diag);
2687 }
2688 }
2689
2690 // FIXME: handle tail calls differently.
2691 unsigned CallOpc;
2692 if (Subtarget->isThumb()) {
2693 if (isCmseNSCall)
2695 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2697 else
2699 } else {
2700 if (!isDirect && !Subtarget->hasV5TOps())
2702 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2703 // Emit regular call when code size is the priority
2704 !Subtarget->hasMinSize())
2705 // "mov lr, pc; b _foo" to avoid confusing the RSP
2707 else
2709 }
2710
2711 // We don't usually want to end the call-sequence here because we would tidy
2712 // the frame up *after* the call, however in the ABI-changing tail-call case
2713 // we've carefully laid out the parameters so that when sp is reset they'll be
2714 // in the correct location.
2715 if (isTailCall && !isSibCall) {
2716 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
2717 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
2718 InFlag = Chain.getValue(1);
2719 }
2720
2721 std::vector<SDValue> Ops;
2722 Ops.push_back(Chain);
2723 Ops.push_back(Callee);
2724
2725 if (isTailCall) {
2726 Ops.push_back(DAG.getTargetConstant(SPDiff, dl, MVT::i32));
2727 }
2728
2729 // Add argument registers to the end of the list so that they are known live
2730 // into the call.
2731 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2732 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2733 RegsToPass[i].second.getValueType()));
2734
2735 // Add a register mask operand representing the call-preserved registers.
2736 if (!isTailCall) {
2737 const uint32_t *Mask;
2738 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2739 if (isThisReturn) {
2740 // For 'this' returns, use the R0-preserving mask if applicable
2741 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2742 if (!Mask) {
2743 // Set isThisReturn to false if the calling convention is not one that
2744 // allows 'returned' to be modeled in this way, so LowerCallResult does
2745 // not try to pass 'this' straight through
2746 isThisReturn = false;
2747 Mask = ARI->getCallPreservedMask(MF, CallConv);
2748 }
2749 } else
2750 Mask = ARI->getCallPreservedMask(MF, CallConv);
2751
2752 assert(Mask && "Missing call preserved mask for calling convention");
2753 Ops.push_back(DAG.getRegisterMask(Mask));
2754 }
2755
2756 if (InFlag.getNode())
2757 Ops.push_back(InFlag);
2758
2760 if (isTailCall) {
2762 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
2763 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2764 return Ret;
2765 }
2766
2767 // Returns a chain and a flag for retval copy to use.
2768 Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
2769 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2770 InFlag = Chain.getValue(1);
2771 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2772
2773 // If we're guaranteeing tail-calls will be honoured, the callee must
2774 // pop its own argument stack on return. But this call is *not* a tail call so
2775 // we need to undo that after it returns to restore the status-quo.
2777 uint64_t CalleePopBytes =
2778 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1ULL;
2779
2780 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
2781 DAG.getIntPtrConstant(CalleePopBytes, dl, true),
2782 InFlag, dl);
2783 if (!Ins.empty())
2784 InFlag = Chain.getValue(1);
2785
2786 // Handle result values, copying them out of physregs into vregs that we
2787 // return.
2788 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
2789 InVals, isThisReturn,
2790 isThisReturn ? OutVals[0] : SDValue());
2791}
2792
2793/// HandleByVal - Every parameter *after* a byval parameter is passed
2794/// on the stack. Remember the next parameter register to allocate,
2795/// and then confiscate the rest of the parameter registers to insure
2796/// this.
2797void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2798 Align Alignment) const {
2799 // Byval (as with any stack) slots are always at least 4 byte aligned.
2800 Alignment = std::max(Alignment, Align(4));
2801
2802 unsigned Reg = State->AllocateReg(GPRArgRegs);
2803 if (!Reg)
2804 return;
2805
2806 unsigned AlignInRegs = Alignment.value() / 4;
2807 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2808 for (unsigned i = 0; i < Waste; ++i)
2809 Reg = State->AllocateReg(GPRArgRegs);
2810
2811 if (!Reg)
2812 return;
2813
2814 unsigned Excess = 4 * (ARM::R4 - Reg);
2815
2816 // Special case when NSAA != SP and parameter size greater than size of
2817 // all remained GPR regs. In that case we can't split parameter, we must
2818 // send it to stack. We also must set NCRN to R4, so waste all
2819 // remained registers.
2820 const unsigned NSAAOffset = State->getNextStackOffset();
2821 if (NSAAOffset != 0 && Size > Excess) {
2822 while (State->AllocateReg(GPRArgRegs))
2823 ;
2824 return;
2825 }
2826
2827 // First register for byval parameter is the first register that wasn't
2828 // allocated before this method call, so it would be "reg".
2829 // If parameter is small enough to be saved in range [reg, r4), then
2830 // the end (first after last) register would be reg + param-size-in-regs,
2831 // else parameter would be splitted between registers and stack,
2832 // end register would be r4 in this case.
2833 unsigned ByValRegBegin = Reg;
2834 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2835 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2836 // Note, first register is allocated in the beginning of function already,
2837 // allocate remained amount of registers we need.
2838 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2839 State->AllocateReg(GPRArgRegs);
2840 // A byval parameter that is split between registers and memory needs its
2841 // size truncated here.
2842 // In the case where the entire structure fits in registers, we set the
2843 // size in memory to zero.
2844 Size = std::max<int>(Size - Excess, 0);
2845}
2846
/// MatchingStackOffset - Return true if the given stack call argument is
/// already available in the same position (relatively) of the caller's
/// incoming argument stack.
static
                            const TargetInstrInfo *TII) {
  // Size of the outgoing argument in bytes; must match the fixed object
  // exactly for the slot to be reusable.
  unsigned Bytes = Arg.getValueSizeInBits() / 8;
  // Sentinel; a real frame index must be assigned before the assert below.
  int FI = std::numeric_limits<int>::max();
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    // The argument was copied out of a vreg; trace the vreg back to a
    // reload from a stack slot.
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      // Only a direct load from a stack slot gives us a frame index.
      if (!TII->isLoadFromStackSlot(*Def, FI))
        return false;
    } else {
      return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
      // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else
    return false;

  assert(FI != std::numeric_limits<int>::max());
  // Only fixed objects (incoming argument area) are candidates; the slot
  // must agree in both offset and size with the outgoing argument.
  if (!MFI.isFixedObjectIndex(FI))
    return false;
  return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
}
2890
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool ARMTargetLowering::IsEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
    const SmallVectorImpl<SDValue> &OutVals,
    const bool isIndirect) const {
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF.getCallingConv();

  assert(Subtarget->supportsTailCall());

  // Indirect tail calls cannot be optimized for Thumb1 if the args
  // to the call take up r0-r3. The reason is that there are no legal registers
  // left to hold the pointer to the function to be called.
  if (Subtarget->isThumb1Only() && Outs.size() >= 4 &&
      (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect))
    return false;

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Exception-handling functions need a special set of instructions to indicate
  // a return to the hardware. Tail-calling another function would probably
  // break this.
  if (CallerF.hasFnAttribute("interrupt"))
    return false;

  // Under GuaranteedTailCallOpt, matching calling conventions are sufficient.
  if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
    return CalleeCC == CallerCC;

  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
    return false;

  // Externally-defined functions with weak linkage should not be
  // tail-called on ARM when the OS does not support dynamic
  // pre-emption of symbols, as the AAELF spec requires normal calls
  // to undefined weak functions to be replaced with a NOP or jump to the
  // next instruction. The behaviour of branch instructions in this
  // situation (as used for tail calls) is implementation-defined, so we
  // cannot rely on the linker replacing the tail call with a return.
    const GlobalValue *GV = G->getGlobal();
    if (GV->hasExternalWeakLinkage() &&
        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
      return false;
  }

  // Check that the call results are passed in the same way.
  LLVMContext &C = *DAG.getContext();
          getEffectiveCallingConv(CalleeCC, isVarArg),
          getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
          CCAssignFnForReturn(CalleeCC, isVarArg),
          CCAssignFnForReturn(CallerCC, CallerF.isVarArg())))
    return false;
  // The callee has to preserve all registers the caller needs to preserve.
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (CalleeCC != CallerCC) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    // The callee's preserved set must be a superset of the caller's; compare
    // via register-mask subset equality.
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // If Caller's vararg or byval argument has been split between registers and
  // stack, do not perform tail call, since part of the argument is in caller's
  // local frame.
  if (AFI_Caller->getArgRegsSaveSize())
    return false;

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
    if (CCInfo.getNextStackOffset()) {
      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo &MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const TargetInstrInfo *TII = Subtarget->getInstrInfo();
      for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
           i != e;
           ++i, ++realArgIdx) {
        CCValAssign &VA = ArgLocs[i];
        EVT RegVT = VA.getLocVT();
        SDValue Arg = OutVals[realArgIdx];
        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) {
          // f64 and vector types are split into multiple registers or
          // register/stack-slot combinations. The types will not match
          // the registers; give up on memory f64 refs until we figure
          // out what to do about this.
          if (!VA.isRegLoc())
            return false;
          if (!ArgLocs[++i].isRegLoc())
            return false;
          if (RegVT == MVT::v2f64) {
            // v2f64 occupies four consecutive register locations.
            if (!ArgLocs[++i].isRegLoc())
              return false;
            if (!ArgLocs[++i].isRegLoc())
              return false;
          }
        } else if (!VA.isRegLoc()) {
          // Stack-passed argument: it must already live in the matching
          // caller stack slot for a sibcall to be safe.
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII))
            return false;
        }
      }
    }

    const MachineRegisterInfo &MRI = MF.getRegInfo();
      return false;
  }

  return true;
}
3023
/// Return true if the given return values can be lowered entirely in
/// registers for this calling convention; delegates to
/// CCState::CheckReturn with the CC-specific return assignment function.
bool
ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
                                  MachineFunction &MF, bool isVarArg,
                                  LLVMContext &Context) const {
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
}
3033
// Build an interrupt-return: insert into RetOps the LR adjustment implied
// by the function's "interrupt" attribute kind (see table below).
                                          const SDLoc &DL, SelectionDAG &DAG) {
  const MachineFunction &MF = DAG.getMachineFunction();
  const Function &F = MF.getFunction();

  StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();

  // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
  // version of the "preferred return address". These offsets affect the return
  // instruction if this is a return from PL1 without hypervisor extensions.
  // IRQ/FIQ: +4 "subs pc, lr, #4"
  // SWI:     0  "subs pc, lr, #0"
  // ABORT:   +4 "subs pc, lr, #4"
  // UNDEF:   +4/+2 "subs pc, lr, #0"
  // UNDEF varies depending on where the exception came from ARM or Thumb
  // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.

  int64_t LROffset;
  if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
      IntKind == "ABORT")
    LROffset = 4;
  else if (IntKind == "SWI" || IntKind == "UNDEF")
    LROffset = 0;
  else
    report_fatal_error("Unsupported interrupt attribute. If present, value "
                       "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");

  // Operand #1 of the return node carries the LR offset.
  RetOps.insert(RetOps.begin() + 1,
                DAG.getConstant(LROffset, DL, MVT::i32, false));

}
3066
/// Lower an IR 'ret' into ARMISD return nodes: assign each return value to
/// its ABI location, emit CopyToReg glue-chained copies, and handle the
/// f64/v2f64 splits, CMSE masking and interrupt-return special cases.
SDValue
ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  // CCValAssign - represent the assignment of the return value to a location.

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));

  SDValue Flag;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  bool isLittleEndian = Subtarget->isLittle();

  AFI->setReturnRegsCount(RVLocs.size());

  // Report error if cmse entry function returns structure through first ptr arg.
  if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
    // Note: using an empty SDLoc(), as the first line of the function is a
    // better place to report than the last line.
        "secure entry function would return value through pointer",
        SDLoc().getDebugLoc());
    DAG.getContext()->diagnose(Diag);
  }

  // Copy the result values into the output registers.
  for (unsigned i = 0, realRVLocIdx = 0;
       i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Arg = OutVals[realRVLocIdx];
    bool ReturnF16 = false;

    if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
      // Half-precision return values can be returned like this:
      //
      // t11 f16 = fadd ...
      // t12: i16 = bitcast t11
      //   t13: i32 = zero_extend t12
      // t14: f32 = bitcast t13  <~~~~~~~ Arg
      //
      // to avoid code generation for bitcasts, we simply set Arg to the node
      // that produces the f16 value, t11 in this case.
      //
      if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
        SDValue ZE = Arg.getOperand(0);
        if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
          SDValue BC = ZE.getOperand(0);
          if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
            Arg = BC.getOperand(0);
            ReturnF16 = true;
          }
        }
      }
    }

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      // Bitcast to the location type unless we kept the raw f16 above.
      if (!ReturnF16)
        Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      break;
    }

    // Mask f16 arguments if this is a CMSE nonsecure entry.
    auto RetVT = Outs[realRVLocIdx].ArgVT;
    if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
      if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
        Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
      } else {
        // Clear the bits above the f16 payload so no secure state leaks
        // through the unused upper lanes of the return register.
        auto LocBits = VA.getLocVT().getSizeInBits();
        auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
        SDValue Mask =
        Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
        Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      }
    }

    if (VA.needsCustom() &&
        (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
      if (VA.getLocVT() == MVT::v2f64) {
        // Extract the first half and return it in two registers.
                                   DAG.getConstant(0, dl, MVT::i32));
                                    DAG.getVTList(MVT::i32, MVT::i32), Half);

        Chain =
            DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                             HalfGPRs.getValue(isLittleEndian ? 0 : 1), Flag);
        Flag = Chain.getValue(1);
        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
        VA = RVLocs[++i]; // skip ahead to next loc
        Chain =
            DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                             HalfGPRs.getValue(isLittleEndian ? 1 : 0), Flag);
        Flag = Chain.getValue(1);
        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
        VA = RVLocs[++i]; // skip ahead to next loc

        // Extract the 2nd half and fall through to handle it as an f64 value.
                          DAG.getConstant(1, dl, MVT::i32));
      }
      // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
      // available.
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                               fmrrd.getValue(isLittleEndian ? 0 : 1), Flag);
      Flag = Chain.getValue(1);
      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
      VA = RVLocs[++i]; // skip ahead to next loc
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                               fmrrd.getValue(isLittleEndian ? 1 : 0), Flag);
    } else
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);

    // Guarantee that all emitted copies are
    // stuck together, avoiding something bad.
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(
        VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
  }
  // Functions returning CSRs via copy also list those registers as outputs.
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (ARM::GPRRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i32));
      else if (ARM::DPRRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Flag.getNode())
    RetOps.push_back(Flag);

  // CPUs which aren't M-class use a special sequence to return from
  // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
  // though we use "subs pc, lr, #N").
  //
  // M-class CPUs actually use a normal return sequence with a special
  // (hardware-provided) value in LR, so the normal code path works.
  if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
      !Subtarget->isMClass()) {
    if (Subtarget->isThumb1Only())
      report_fatal_error("interrupt attribute is not supported in Thumb1");
    return LowerInterruptReturn(RetOps, dl, DAG);
  }

  return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
}
3243
/// Return true if the only use of N (an outgoing libcall-style value) is a
/// return, so the call producing it may be tail-called. On success, Chain
/// is updated to the chain feeding the copies, skipping them.
bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  if (N->getNumValues() != 1)
    return false;
  if (!N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
    SDNode *VMov = Copy;
    // f64 returned in a pair of GPRs.
    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
         UI != UE; ++UI) {
      // Every user of the VMOVRRD must itself be a CopyToReg.
      if (UI->getOpcode() != ISD::CopyToReg)
        return false;
      Copies.insert(*UI);
    }
    if (Copies.size() > 2)
      return false;

    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
         UI != UE; ++UI) {
      SDValue UseChain = UI->getOperand(0);
      if (Copies.count(UseChain.getNode()))
        // Second CopyToReg
        Copy = *UI;
      else {
        // We are at the top of this chain.
        // If the copy has a glue operand, we conservatively assume it
        // isn't safe to perform a tail call.
        if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue)
          return false;
        // First CopyToReg
        TCChain = UseChain;
      }
    }
  } else if (Copy->getOpcode() == ISD::BITCAST) {
    // f32 returned in a single GPR.
    if (!Copy->hasOneUse())
      return false;
    Copy = *Copy->use_begin();
    if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
      return false;
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else {
    return false;
  }

  // The copies must feed only return nodes (normal or interrupt return).
  bool HasRet = false;
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
       UI != UE; ++UI) {
    if (UI->getOpcode() != ARMISD::RET_FLAG &&
        UI->getOpcode() != ARMISD::INTRET_FLAG)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}
3318
3319bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3320 if (!Subtarget->supportsTailCall())
3321 return false;
3322
3323 if (!CI->isTailCall())
3324 return false;
3325
3326 return true;
3327}
3328
// Trying to write a 64 bit value so need to split into two 32 bit values first,
// and pass the lower and high parts through.
  SDLoc DL(Op);
  SDValue WriteValue = Op->getOperand(2);

  // This function is only supposed to be called for i64 type argument.
  assert(WriteValue.getValueType() == MVT::i64
          && "LowerWRITE_REGISTER called for non-i64 type argument.");

  // Split the i64 write value into its low and high i32 halves.
                           DAG.getConstant(0, DL, MVT::i32));
                           DAG.getConstant(1, DL, MVT::i32));
  // Rebuild the WRITE_REGISTER node with chain, register id, and both halves.
  SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
  return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
}
3346
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOVi.
SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
                                             SelectionDAG &DAG) const {
  EVT PtrVT = Op.getValueType();
  // FIXME there is no actual debug info here
  SDLoc dl(Op);
  SDValue Res;

  // When generating execute-only code Constant Pools must be promoted to the
  // global data section. It's a bit ugly that we can't share them across basic
  // blocks, but this way we guarantee that execute-only behaves correct with
  // position-independent addressing modes.
  if (Subtarget->genExecuteOnly()) {
    auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
    // GlobalVariable ctor needs non-const Type/Constant/Module pointers.
    auto T = const_cast<Type*>(CP->getType());
    auto C = const_cast<Constant*>(CP->getConstVal());
    auto M = const_cast<Module*>(DAG.getMachineFunction().
                                 getFunction().getParent());
    // Promote the pool entry to an internal constant global with a unique
    // PIC-label-derived name, then lower it as a plain global address.
    auto GV = new GlobalVariable(
                    *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
                    Twine(AFI->createPICLabelUId())
                    );
                                            dl, PtrVT);
    return LowerGlobalAddress(GA, DAG);
  }

  if (CP->isMachineConstantPoolEntry())
    Res =
        DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlign());
  else
    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlign());
  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
}
3389
3393
/// Lower a blockaddress reference: materialize it through a constant-pool
/// load, adding a PIC label fixup when position-independent (or ROPI).
SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
                                             SelectionDAG &DAG) const {
  unsigned ARMPCLabelIndex = 0;
  SDLoc DL(Op);
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
  if (!IsPositionIndependent) {
    // Absolute addressing: a plain constant-pool entry suffices.
    CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
  } else {
    // PC-relative: account for the pipeline offset (8 in ARM, 4 in Thumb).
    unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
    ARMPCLabelIndex = AFI->createPICLabelUId();
        ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
  }
  SDValue Result = DAG.getLoad(
      PtrVT, DL, DAG.getEntryNode(), CPAddr,
    return Result;
  // PIC: add the PC label to the loaded offset to form the final address.
  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
  return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
}
3423
/// Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address for Darwin, and return an
/// SDValue containing the final node.

/// Darwin only has one TLS scheme which must be capable of dealing with the
/// fully general situation, in the worst case. This means:
///     + "extern __thread" declaration.
///     + Defined in a possibly unknown dynamic library.
///
/// The general system is that each __thread variable has a [3 x i32] descriptor
/// which contains information used by the runtime to calculate the address. The
/// only part of this the compiler needs to know about is the first word, which
/// contains a function pointer that must be called with the address of the
/// entire descriptor in "r0".
///
/// Since this descriptor may be in a different unit, in general access must
/// proceed along the usual ARM rules. A common sequence to produce is:
///
///     movw rT1, :lower16:_var$non_lazy_ptr
///     movt rT1, :upper16:_var$non_lazy_ptr
///     ldr r0, [rT1]
///     ldr rT2, [r0]
///     blx rT2
///     [...address now in r0...]
SDValue
ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "This function expects a Darwin target");
  SDLoc DL(Op);

  // First step is to get the address of the actual global symbol. This is where
  // the TLS descriptor lives.
  SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
      MVT::i32, DL, Chain, DescAddr,
  Chain = FuncTLVGet.getValue(1);

  // The helper call adjusts the stack; record that in the frame info.
  MachineFrameInfo &MFI = F.getFrameInfo();
  MFI.setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
  // silly).
  auto TRI =
      getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
  auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
  const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());

  // Finally, we can make the call. This is just a degenerate version of a
  // normal AArch64 call node: r0 takes the address of the descriptor, and
  // returns the address of the variable in this thread.
  Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
  Chain =
      Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
      DAG.getRegisterMask(Mask), Chain.getValue(1));
  return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
}
3491
/// Windows TLS lowering: read the TEB via an MRC coprocessor intrinsic,
/// index the TLS array with the module's _tls_index, and add the variable's
/// SECREL offset (loaded from a constant pool) to the thread's TLS base.
SDValue
ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");

  SDValue Chain = DAG.getEntryNode();
  SDLoc DL(Op);

  // Load the current TEB (thread environment block)
  // via MRC p15, 0, <Rd>, c13, c0, 2 (the TPIDRURW/software thread register).
  SDValue Ops[] = {Chain,
                   DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
                   DAG.getTargetConstant(15, DL, MVT::i32),
                   DAG.getTargetConstant(0, DL, MVT::i32),
                   DAG.getTargetConstant(13, DL, MVT::i32),
                   DAG.getTargetConstant(0, DL, MVT::i32),
                   DAG.getTargetConstant(2, DL, MVT::i32)};
                                   DAG.getVTList(MVT::i32, MVT::Other), Ops);

  SDValue TEB = CurrentTEB.getValue(0);
  Chain = CurrentTEB.getValue(1);

  // Load the ThreadLocalStoragePointer from the TEB
  // A pointer to the TLS array is located at offset 0x2c from the TEB.
      DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));

  // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
  // offset into the TLSArray.

  // Load the TLS index from the C runtime

  // Slot = TLSIndex << 2 (byte offset into the pointer-sized TLS array).
                             DAG.getConstant(2, DL, MVT::i32));
  SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
                            DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),

  // Get the offset of the start of the .tls section (section base)
  const auto *GA = cast<GlobalAddressSDNode>(Op);
  auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
  SDValue Offset = DAG.getLoad(
      PtrVT, DL, Chain,

  return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
}
3547
// Lower ISD::GlobalTLSAddress using the "general dynamic" model:
// build a TLSGD constant-pool entry, form its PC-relative address, and
// call the __tls_get_addr runtime helper with it.
SDValue
ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
                                                 SelectionDAG &DAG) const {
  SDLoc dl(GA);
  // PC bias is 8 in ARM mode, 4 in Thumb mode.
  unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
      ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
  Argument = DAG.getLoad(
      PtrVT, dl, DAG.getEntryNode(), Argument,
  SDValue Chain = Argument.getValue(1);

  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);

  // call __tls_get_addr.
  ArgListTy Args;
  ArgListEntry Entry;
  Entry.Node = Argument;
  Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
  Args.push_back(Entry);

  // FIXME: is there useful debug info available here?
  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
      DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));

  // The helper returns the variable's address in the call's first result.
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  return CallResult.first;
}
3587
3588// Lower ISD::GlobalTLSAddress using the "initial exec" or
3589// "local exec" model.
3590SDValue
3591ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
3592 SelectionDAG &DAG,
3593 TLSModel::Model model) const {
3594 const GlobalValue *GV = GA->getGlobal();
3595 SDLoc dl(GA);
3597 SDValue Chain = DAG.getEntryNode();
3599 // Get the Thread Pointer
3601
3605 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3606 // Initial exec model.
  // PC-relative fixup differs between Thumb (4) and ARM (8) encodings.
3607 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3609 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3611 true);
  // First load: fetch the constant-pool entry describing the variable.
3614 Offset = DAG.getLoad(
3615 PtrVT, dl, Chain, Offset,
3617 Chain = Offset.getValue(1);
3618
3619 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3621
  // Second load: resolve the thread-pointer-relative offset itself.
3622 Offset = DAG.getLoad(
3623 PtrVT, dl, Chain, Offset,
3625 } else {
3626 // local exec model
3632 Offset = DAG.getLoad(
3633 PtrVT, dl, Chain, Offset,
3635 }
3636
3637 // The address of the thread local variable is the add of the thread
3638 // pointer with the offset of the variable.
3639 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
3640}
3641
3642SDValue
3643ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
  // Dispatch ISD::GlobalTLSAddress to the platform- and model-specific
  // lowering routine (emulated, Darwin, Windows, or ELF general-dynamic /
  // initial-exec / local-exec).
3645 if (DAG.getTarget().useEmulatedTLS())
3646 return LowerToTLSEmulatedModel(GA, DAG);
3647
3648 if (Subtarget->isTargetDarwin())
3649 return LowerGlobalTLSAddressDarwin(Op, DAG);
3650
3651 if (Subtarget->isTargetWindows())
3652 return LowerGlobalTLSAddressWindows(Op, DAG);
3653
3654 // TODO: implement the "local dynamic" model
3655 assert(Subtarget->isTargetELF() && "Only ELF implemented here");
3657
3658 switch (model) {
3661 return LowerToTLSGeneralDynamicModel(GA, DAG);
3664 return LowerToTLSExecModels(GA, DAG, model);
3665 }
3666 llvm_unreachable("bogus TLS model");
3667}
3668
3669/// Return true if all users of V are within function F, looking through
3670/// ConstantExprs.
3671static bool allUsersAreInFunction(const Value *V, const Function *F) {
3672 SmallVector<const User*,4> Worklist(V->users());
3673 while (!Worklist.empty()) {
3674 auto *U = Worklist.pop_back_val();
3675 if (isa<ConstantExpr>(U)) {
3676 append_range(Worklist, U->users());
3677 continue;
3678 }
3679
3680 auto *I = dyn_cast<Instruction>(U);
3681 if (!I || I->getParent()->getParent() != F)
3682 return false;
3683 }
3684 return true;
3685}
3686
3688 const GlobalValue *GV, SelectionDAG &DAG,
3689 EVT PtrVT, const SDLoc &dl) {
3690 // If we're creating a pool entry for a constant global with unnamed address,
3691 // and the global is small enough, we can emit it inline into the constant pool
3692 // to save ourselves an indirection.
3693 //
3694 // This is a win if the constant is only used in one function (so it doesn't
3695 // need to be duplicated) or duplicating the constant wouldn't increase code
3696 // size (implying the constant is no larger than 4 bytes).
3697 const Function &F = DAG.getMachineFunction().getFunction();
3698
3699 // We rely on this decision to inline being idempotent and unrelated to the
3700 // use-site. We know that if we inline a variable at one use site, we'll
3701 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
3702 // doesn't know about this optimization, so bail out if it's enabled else
3703 // we could decide to inline here (and thus never emit the GV) but require
3704 // the GV from fast-isel generated code.
3707 return SDValue();
3708
  // Only locally-defined, constant, unnamed_addr global variables with an
  // initializer are candidates for inlining into the pool.
3709 auto *GVar = dyn_cast<GlobalVariable>(GV);
3710 if (!GVar || !GVar->hasInitializer() ||
3711 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3712 !GVar->hasLocalLinkage())
3713 return SDValue();
3714
3715 // If we inline a value that contains relocations, we move the relocations
3716 // from .data to .text. This is not allowed in position-independent code.
3717 auto *Init = GVar->getInitializer();
3718 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
3719 Init->needsDynamicRelocation())
3720 return SDValue();
3721
3722 // The constant islands pass can only really deal with alignment requests
3723 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
3724 // any type wanting greater alignment requirements than 4 bytes. We also
3725 // can only promote constants that are multiples of 4 bytes in size or
3726 // are paddable to a multiple of 4. Currently we only try and pad constants
3727 // that are strings for simplicity.
3729 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3730 Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
3731 unsigned RequiredPadding = 4 - (Size % 4);
3732 bool PaddingPossible =
3733 RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3734 if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
3735 Size == 0)
3736 return SDValue();
3737
3738 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
3740 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3741
3742 // We can't bloat the constant pool too much, else the ConstantIslands pass
3743 // may fail to converge. If we haven't promoted this global yet (it may have
3744 // multiple uses), and promoting it would increase the constant pool size (Sz
3745 // > 4), ensure we have space to do so up to MaxTotal.
3746 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
3747 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3749 return SDValue();
3750
3751 // This is only valid if all users are in a single function; we can't clone
3752 // the constant in general. The LLVM IR unnamed_addr allows merging
3753 // constants, but not cloning them.
3754 //
3755 // We could potentially allow cloning if we could prove all uses of the
3756 // constant in the current function don't care about the address, like
3757 // printf format strings. But that isn't implemented for now.
3759 return SDValue();
3760
3761 // We're going to inline this global. Pad it out if needed.
3762 if (RequiredPadding != 4) {
3763 StringRef S = CDAInit->getAsString();
3764
3766 std::copy(S.bytes_begin(), S.bytes_end(), V.begin())
3770 }
3771
  // Record the promotion (and pool-size increase) the first time we see
  // this global, then wrap the constant-pool address for the caller.
3774 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
3777 PaddedSize - 4);
3778 }
3780 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3781}
3782
  // Look through an alias to its base object; no base object means we
  // cannot prove read-only-ness.
3784 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
3785 if (!(GV = GA->getBaseObject()))
3786 return false;
  // A global variable is read-only iff it is marked constant; functions
  // live in the text section and also count as read-only.
3787 if (const auto *V = dyn_cast<GlobalVariable>(GV))
3788 return V->isConstant();
3789 return isa<Function>(GV);
3790}
3791
3792SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3793 SelectionDAG &DAG) const {
3794 switch (Subtarget->getTargetTriple().getObjectFormat()) {
3795 default: llvm_unreachable("unknown object format");
3796 case Triple::COFF:
3797 return LowerGlobalAddressWindows(Op, DAG);
3798 case Triple::ELF:
3799 return LowerGlobalAddressELF(Op, DAG);
3800 case Triple::MachO:
3801 return LowerGlobalAddressDarwin(Op, DAG);
3802 }
3803}
3804
3805SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3806 SelectionDAG &DAG) const {
3808 SDLoc dl(Op);
3809 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3811 bool IsRO = isReadOnly(GV);
3812
3813 // promoteToConstantPool only if not generating XO text section
3814 if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly())
3815 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
3816 return V;
3817
3818 if (isPositionIndependent()) {
  // Non-DSO-local globals must go through the GOT (GOT_PREL); DSO-local
  // ones can be addressed PC-relatively.
3819 bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
3820 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
3823 if (UseGOT_PREL)
3824 Result =
3825 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3827 return Result;
3828 } else if (Subtarget->isROPI() && IsRO) {
3829 // PC-relative.
3830 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3832 return Result;
3833 } else if (Subtarget->isRWPI() && !IsRO) {
3834 // SB-relative.
3836 if (Subtarget->useMovt()) {
3837 ++NumMovwMovt;
3839 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
3840 } else { // use literal pool for address constant
3845 RelAddr = DAG.getLoad(
3846 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3848 }
  // R9 is the static base (SB) register in RWPI mode.
3849 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3850 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
3851 return Result;
3852 }
3853
3854 // If we have T2 ops, we can materialize the address directly via movt/movw
3855 // pair. This is always cheaper.
3856 if (Subtarget->useMovt()) {
3857 ++NumMovwMovt;
3858 // FIXME: Once remat is capable of dealing with instructions with register
3859 // operands, expand this into two nodes.
3860 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
3861 DAG.getTargetGlobalAddress(GV, dl, PtrVT))
3862 } else {
3865 return DAG.getLoad(
3866 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3868 }
3869}
3870
3871SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
3872 SelectionDAG &DAG) const {
3873 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3874 "ROPI/RWPI not currently supported for Darwin");
3876 SDLoc dl(Op);
3877 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3878
3879 if (Subtarget->useMovt())
3880 ++NumMovwMovt;
3881
3882 // FIXME: Once remat is capable of dealing with instructions with register
3883 // operands, expand this into multiple nodes
3884 unsigned Wrapper =
3886
3888 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
3889
  // Indirect (non-lazy-pointer) symbols need an extra load through the stub
  // to obtain the real address.
3890 if (Subtarget->isGVIndirectSymbol(GV))
3891 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3893 return Result;
3894}
3895
3896SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
3897 SelectionDAG &DAG) const {
3898 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
3899 assert(Subtarget->useMovt() &&
3900 "Windows on ARM expects to use movw/movt");
3901 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3902 "ROPI/RWPI not currently supported for Windows");
3903
3905 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  // DLL-imported and non-DSO-local globals are reached via a pointer slot
  // (import table entry or COFF stub) rather than directly.
3906 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
3907 if (GV->hasDLLImportStorageClass())
3908 TargetFlags = ARMII::MO_DLLIMPORT;
3909 else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
3910 TargetFlags = ARMII::MO_COFFSTUB;
3913 SDLoc DL(Op);
3914
3915 ++NumMovwMovt;
3916
3917 // FIXME: Once remat is capable of dealing with instructions with register
3918 // operands, expand this into two nodes.
3920 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
3921 TargetFlags));
  // For indirect symbols, load through the slot to get the final address.
3922 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
3923 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3925 return Result;
3926}
3927
3928SDValue
3929ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
3930 SDLoc dl(Op);
3931 SDValue Val = DAG.getConstant(0, dl, MVT::i32);
3932 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
3933 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
3934 Op.getOperand(1), Val);
3935}
3936
3937SDValue
3938ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
3939 SDLoc dl(Op);
3940 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
3941 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
3942}
3943
3944SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
3945 SelectionDAG &DAG) const {
3946 SDLoc dl(Op);
  // Only the incoming chain is forwarded to the setup-dispatch node.
3948 Op.getOperand(0));
3949}
3950
3951SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
3952 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
  // The intrinsic ID operand sits after the chain when a chain is present
  // (operand 0 has type MVT::Other), otherwise it is operand 0.
3953 unsigned IntNo =
3955 Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other))
3956 ->getZExtValue();
3957 switch (IntNo) {
3958 default:
3959 return SDValue(); // Don't custom lower most intrinsics.
3960 case Intrinsic::arm_gnu_eabi_mcount: {
3963 SDLoc dl(Op);
3964 SDValue Chain = Op.getOperand(0);
3965 // call "\01__gnu_mcount_nc"
3966 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
3967 const uint32_t *Mask =
3968 ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
3969 assert(Mask && "Missing call preserved mask for calling convention");
3970 // Mark LR an implicit live-in.
3971 unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
3973 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
3974 constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
3975 SDValue Callee =
3976 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
  // Thumb uses tBL_PUSHLR (with predicate operands); ARM uses BL_PUSHLR.
3978 if (Subtarget->isThumb())
3979 return SDValue(
3980 DAG.getMachineNode(
3981 ARM::tBL_PUSHLR, dl, ResultTys,
3982 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
3983 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
3984 0);
3985 return SDValue(
3986 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
3987 {ReturnAddress, Callee, RegisterMask, Chain}),
3988 0);
3989 }
3990 }
3991}
3992
3993SDValue
3994ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
3995 const ARMSubtarget *Subtarget) const {
  // Custom-lower the chainless ARM intrinsics; everything not handled here
  // falls back to the generic path by returning an empty SDValue.
3996 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3997 SDLoc dl(Op);
3998 switch (IntNo) {
3999 default: return SDValue(); // Don't custom lower most intrinsics.
4000 case Intrinsic::thread_pointer: {
4002 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
4003 }
4004 case Intrinsic::arm_cls: {
  // cls(x) = ctlz((((x >> 31) ^ x) << 1) | 1): fold the sign bit into the
  // value so that counting leading zeros counts leading sign bits.
4005 const SDValue &Operand = Op.getOperand(1);
4006 const EVT VTy = Op.getValueType();
4007 SDValue SRA =
4008 DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
4009 SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
4010 SDValue SHL =
4011 DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
4012 SDValue OR =
4013 DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
4014 SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
4015 return Result;
4016 }
4017 case Intrinsic::arm_cls64: {
4018 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
4019 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
4020 const SDValue &Operand = Op.getOperand(1);
4021 const EVT VTy = Op.getValueType();
4022
4023 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand,
4024 DAG.getConstant(1, dl, VTy));
4025 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand,
4026 DAG.getConstant(0, dl, VTy));
4027 SDValue Constant0 = DAG.getConstant(0, dl, VTy);
4028 SDValue Constant1 = DAG.getConstant(1, dl, VTy);
4029 SDValue Constant31 = DAG.getConstant(31, dl, VTy);
4030 SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
4031 SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
4032 SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
4033 SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
4034 SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
4040 DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
4042 SDValue Result =
4043 DAG.getSelect(dl, VTy, CheckLo,
4044 DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);
4045 return Result;
4046 }
4047 case Intrinsic::eh_sjlj_lsda: {
  // Materialize the address of the function's LSDA via a constant-pool
  // entry, with a PIC_ADD fixup when position independent.
4050 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
4054 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
4056 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
4060 SDValue Result = DAG.getLoad(
4061 PtrVT, dl, DAG.getEntryNode(), CPAddr,
4063
4065 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
4066 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
4067 }
4068 return Result;
4069 }
4070 case Intrinsic::arm_neon_vabs:
4071 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
4072 Op.getOperand(1));
4073 case Intrinsic::arm_neon_vmulls:
4074 case Intrinsic::arm_neon_vmullu: {
4075 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
4076 ? ARMISD::VMULLs : ARMISD::VMULLu;
4077 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4078 Op.getOperand(1), Op.getOperand(2));
4079 }
4080 case Intrinsic::arm_neon_vminnm:
4081 case Intrinsic::arm_neon_vmaxnm: {
4082 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
4083 ? ISD::FMINNUM : ISD::FMAXNUM;
4084 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4085 Op.getOperand(1), Op.getOperand(2));
4086 }
4087 case Intrinsic::arm_neon_vminu:
4088 case Intrinsic::arm_neon_vmaxu: {
4089 if (Op.getValueType().isFloatingPoint())
4090 return SDValue();
4091 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
4092 ? ISD::UMIN : ISD::UMAX;
4093 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4094 Op.getOperand(1), Op.getOperand(2));
4095 }
4096 case Intrinsic::arm_neon_vmins:
4097 case Intrinsic::arm_neon_vmaxs: {
4098 // v{min,max}s is overloaded between signed integers and floats.
4099 if (!Op.getValueType().isFloatingPoint()) {
4100 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4101 ? ISD::SMIN : ISD::SMAX;
4102 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4103 Op.getOperand(1), Op.getOperand(2));
4104 }
4105 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4106 ? ISD::FMINIMUM : ISD::FMAXIMUM;
4107 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4108 Op.getOperand(1), Op.getOperand(2));
4109 }
4110 case Intrinsic::arm_neon_vtbl1:
4111 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
4112 Op.getOperand(1), Op.getOperand(2));
4113 case Intrinsic::arm_neon_vtbl2:
4114 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
4115 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4116 case Intrinsic::arm_mve_pred_i2v:
4117 case Intrinsic::arm_mve_pred_v2i:
4118 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
4119 Op.getOperand(1));
4120 case Intrinsic::arm_mve_vreinterpretq:
4121 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
4122 Op.getOperand(1));
4123 case Intrinsic::arm_mve_lsll:
4124 return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
4125 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4126 case Intrinsic::arm_mve_asrl:
4127 return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
4128 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4129 }
4130}
4131
4133 const ARMSubtarget *Subtarget) {
4134 SDLoc dl(Op);
  // A single-thread fence needs no hardware barrier at all.
4135 ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2));
4136 auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue());
4137 if (SSID == SyncScope::SingleThread)
4138 return Op;
4139
4140 if (!Subtarget->hasDataBarrier()) {
4141 // Some ARMv6 cpus can support data barriers with an mcr instruction.
4142 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
4143 // here.
4144 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
4145 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
4146 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
4147 DAG.getConstant(0, dl, MVT::i32));
4148 }
4149
4150 ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
4151 AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
4153 if (Subtarget->isMClass()) {
4154 // Only a full system barrier exists in the M-class architectures.
4156 } else if (Subtarget->preferISHSTBarriers() &&
4157 Ord == AtomicOrdering::Release) {
4158 // Swift happens to implement ISHST barriers in a way that's compatible with
4159 // Release semantics but weaker than ISH so we'd be fools not to use
4160 // it. Beware: other processors probably don't!
4162 }
4163
  // Emit a DMB with the chosen domain via the arm.dmb intrinsic.
4164 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
4165 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
4166 DAG.getConstant(Domain, dl, MVT::i32));
4167}
4168
4170 const ARMSubtarget *Subtarget) {
4171 // ARM pre v5TE and Thumb1 does not have preload instructions.
4172 if (!(Subtarget->isThumb2() ||
4173 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
4174 // Just preserve the chain.
4175 return Op.getOperand(0);
4176
4177 SDLoc dl(Op);
  // Operand 2 is the read/write flag; PLD encodes "read" as its inverse.
4178 unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
4179 if (!isRead &&
4180 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
4181 // ARMv7 with MP extension has PLDW.
4182 return Op.getOperand(0);
4183
4184 unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
4185 if (Subtarget->isThumb()) {
4186 // Invert the bits.
4187 isRead = ~isRead & 1;
4188 isData = ~isData & 1;
4189 }
4190
4191 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
4192 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
4193 DAG.getConstant(isData, dl, MVT::i32));
4194}
4195
4198 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
4199
4200 // vastart just stores the address of the VarArgsFrameIndex slot into the
4201 // memory location argument.
4202 SDLoc dl(Op);
  // Operand 2 carries the source Value for the memory operand info.
4205 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
4206 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
4207 MachinePointerInfo(SV));
4208}
4209
4210SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
4212 SDValue &Root,
4213 SelectionDAG &DAG,
4214 const SDLoc &dl) const {
  // Reassemble an f64 formal argument from two i32 halves, which may live
  // in GPRs and/or a stack slot, via VMOVDRR.
4217
4218 const TargetRegisterClass *RC;
4219 if (AFI->isThumb1OnlyFunction())
4220 RC = &ARM::tGPRRegClass;
4221 else
4222 RC = &ARM::GPRRegClass;
4223
4224 // Transform the arguments stored in physical registers into virtual ones.
4225 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
4226 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4227
4229 if (NextVA.isMemLoc()) {
4230 MachineFrameInfo &MFI = MF.getFrameInfo();
4231 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
4232
4233 // Create load node to retrieve arguments from the stack.
4235 ArgValue2 = DAG.getLoad(
4236 MVT::i32, dl, Root, FIN,
4238 } else {
4239 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
4240 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4241 }
  // Big-endian targets hold the two halves in the opposite order.
4242 if (!Subtarget->isLittle())
4243 std::swap (ArgValue, ArgValue2);
4244 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
4245}
4246
4247// The remaining GPRs hold either the beginning of variable-argument
4248// data, or the beginning of an aggregate passed by value (usually
4249// byval). Either way, we allocate stack slots adjacent to the data
4250// provided by our caller, and store the unallocated registers there.
4251// If this is a variadic function, the va_list pointer will begin with
4252// these values; otherwise, this reassembles a (byval) structure that
4253// was split between registers and memory.
4254// Return: The frame index registers were stored into.
4255int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
4256 const SDLoc &dl, SDValue &Chain,
4257 const Value *OrigArg,
4258 unsigned InRegsParamRecordIdx,
4259 int ArgOffset, unsigned ArgSize) const {
4260 // Currently, two use-cases possible:
4261 // Case #1. Non-var-args function, and we meet first byval parameter.
4262 // Setup first unallocated register as first byval register;
4263 // eat all remained registers
4264 // (these two actions are performed by HandleByVal method).
4265 // Then, here, we initialize stack frame with
4266 // "store-reg" instructions.
4267 // Case #2. Var-args function, that doesn't contain byval parameters.
4268 // The same: eat all remained unallocated registers,
4269 // initialize stack frame.
4270
4272 MachineFrameInfo &MFI = MF.getFrameInfo();
4274 unsigned RBegin, REnd;
4277 } else {
4278 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4279 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
4280 REnd = ARM::R4;
4281 }
4282
  // Place the fixed object so the spilled registers sit just below the
  // caller-provided argument area.
4283 if (REnd != RBegin)
4284 ArgOffset = -4 * (ARM::R4 - RBegin);
4285
4286 auto PtrVT = getPointerTy(DAG.getDataLayout());
4287 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
4288 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
4289
4291 const TargetRegisterClass *RC =
4292 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
4293
  // Spill each live-in GPR into consecutive 4-byte slots of the object.
4294 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
4295 unsigned VReg = MF.addLiveIn(Reg, RC);
4296 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
4297 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
4299 MemOps.push_back(Store);
4300 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
4301 }
4302
4303 if (!MemOps.empty())
4304 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
4305 return FrameIndex;
4306}
4307
4308// Setup stack frame, the va_list pointer will start from.
4309void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
4310 const SDLoc &dl, SDValue &Chain,
4311 unsigned ArgOffset,
4312 unsigned TotalArgRegsSaveSize,
4313 bool ForceMutable) const {
4316
4317 // Try to store any remaining integer argument regs
4318 // to their spots on the stack so that they may be loaded by dereferencing
4319 // the result of va_next.
4320 // If there is no regs to be stored, just point address after last
4321 // argument passed via stack.
4322 int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
4323 CCInfo.getInRegsParamsCount(),
4324 CCInfo.getNextStackOffset(),
4325 std::max(4U, TotalArgRegsSaveSize));
  // Remember where the va_list area starts; LowerVASTART reads this back.
4326 AFI->setVarArgsFrameIndex(FrameIndex);
4327}
4328
4329bool ARMTargetLowering::splitValueIntoRegisterParts(
4330 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4331 unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const {
4332 bool IsABIRegCopy = CC.hasValue();
4333 EVT ValueVT = Val.getValueType();
  // f16/bf16 values passed in an f32 register for the ABI are widened and
  // then bitcast into the single f32 part.
4334 if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
4335 PartVT == MVT::f32) {
4336 unsigned ValueBits = ValueVT.getSizeInBits();
4337 unsigned PartBits = PartVT.getSizeInBits();
4340 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
4341 Parts[0] = Val;
4342 return true;
4343 }
  // All other splits use the generic target-independent handling.
4344 return false;
4345}
4346
4347SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
4348 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
4350 bool IsABIRegCopy = CC.hasValue();
  // Inverse of splitValueIntoRegisterParts: an f16/bf16 value carried in an
  // f32 ABI register is recovered by narrowing and bitcasting the one part.
4351 if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
4352 PartVT == MVT::f32) {
4353 unsigned ValueBits = ValueVT.getSizeInBits();
4354 unsigned PartBits = PartVT.getSizeInBits();
4355 SDValue Val = Parts[0];
4356
4359 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
4360 return Val;
4361 }
  // Otherwise let the generic code join the parts.
4362 return SDValue();
4363}
4364
4365SDValue ARMTargetLowering::LowerFormalArguments(
4366 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4367 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4368 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4370 MachineFrameInfo &MFI = MF.getFrameInfo();
4371
4373
4374 // Assign locations to all of the incoming arguments.
4376 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4377 *DAG.getContext());
4378 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
4379
4381 SDValue ArgValue;
4383 unsigned CurArgIdx = 0;
4384
4385 // Initially ArgRegsSaveSize is zero.
4386 // Then we increase this value each time we meet byval parameter.
4387 // We also increase this value in case of varargs function.
4388 AFI->setArgRegsSaveSize(0);
4389
4390 // Calculate the amount of stack space that we need to allocate to store
4391 // byval and variadic arguments that are passed in registers.
4392 // We need to know this before we allocate the first byval or variadic
4393 // argument, as they will be allocated a stack slot below the CFA (Canonical
4394 // Frame Address, the stack pointer at entry to the function).
4395 unsigned ArgRegBegin = ARM::R4;
4396 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4397 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
4398 break;
4399
4400 CCValAssign &VA = ArgLocs[i];
4401 unsigned Index = VA.getValNo();
4402 ISD::ArgFlagsTy Flags = Ins[Index].Flags;
4403 if (!Flags.isByVal())
4404 continue;
4405
4406 assert(VA.isMemLoc() && "unexpected byval pointer in reg");
4407 unsigned RBegin, REnd;
4409 ArgRegBegin = std::min(ArgRegBegin, RBegin);
4410
4411 CCInfo.nextInRegsParam();
4412 }
4413 CCInfo.rewindByValRegsInfo();
4414
4415 int lastInsIndex = -1;
4416 if (isVarArg && MFI.hasVAStart()) {
4417 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4418 if (RegIdx != array_lengthof(GPRArgRegs))
4419 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
4420 }
4421
4422 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
4424 auto PtrVT = getPointerTy(DAG.getDataLayout());
4425
4426 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4427 CCValAssign &VA = ArgLocs[i];
4428 if (Ins[VA.getValNo()].isOrigArg()) {
4429 std::advance(CurOrigArg,
4430 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
4431 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
4432 }
4433 // Arguments stored in registers.
4434 if (VA.isRegLoc()) {
4435 EVT RegVT = VA.getLocVT();
4436
4437 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
4438 // f64 and vector types are split up into multiple registers or
4439 // combinations of registers and stack slots.
4441 GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4442 VA = ArgLocs[++i]; // skip ahead to next loc
4444 if (VA.isMemLoc()) {
4445 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
4446 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4447 ArgValue2 = DAG.getLoad(
4448 MVT::f64, dl, Chain, FIN,
4450 } else {
4451 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4452 }
4453 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
4454 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4455 ArgValue1, DAG.getIntPtrConstant(0, dl));
4456 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4457 ArgValue2, DAG.getIntPtrConstant(1, dl));
4458 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
4459 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4460 } else {
4461 const TargetRegisterClass *RC;
4462
4463 if (RegVT == MVT::f16 || RegVT == MVT::bf16)
4464 RC = &ARM::HPRRegClass;
4465 else if (RegVT == MVT::f32)
4466 RC = &ARM::SPRRegClass;
4467 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
4468 RegVT == MVT::v4bf16)
4469 RC = &ARM::DPRRegClass;
4470 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
4471 RegVT == MVT::v8bf16)
4472 RC = &ARM::QPRRegClass;
4473 else if (RegVT == MVT::i32)
4474 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
4475 : &ARM::GPRRegClass;
4476 else
4477 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4478
4479 // Transform the arguments in physical registers into virtual ones.
4480 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
4481 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4482
4483 // If this value is passed in r0 and has the returned attribute (e.g.
4484 // C++ 'structors), record this fact for later use.
4485 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
4486 AFI->setPreservesR0();
4487 }
4488 }
4489
4490 // If this is an 8 or 16-bit value, it is really passed promoted
4491 // to 32 bits. Insert an assert[sz]ext to capture this, then
4492 // truncate to the right size.
4493 switch (VA.getLocInfo()) {
4494 default: llvm_unreachable("Unknown loc info!");
4495 case CCValAssign::Full: break;
4496 case CCValAssign::BCvt:
4497 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
4498 break;
4499 case CCValAssign::SExt:
4500 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
4501 DAG.getValueType(VA.getValVT()));
4502 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
4503 break;
4504 case CCValAssign::ZExt:
4505 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
4506 DAG.getValueType(VA.getValVT()));
4507 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
4508 break;
4509 }
4510
4511 // f16 arguments have their size extended to 4 bytes and passed as if they
4512 // had been copied to the LSBs of a 32-bit register.
4513 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
4514 if (VA.needsCustom() &&
4515 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
4516 ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
4517
4518 InVals.push_back(ArgValue);
4519 } else { // VA.isRegLoc()
4520 // sanity check
4521 assert(VA.isMemLoc());
4522 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
4523
4524 int index = VA.getValNo();
4525
4526 // Some Ins[] entries become multiple ArgLoc[] entries.
4527 // Process them only once.
4528 if (index != lastInsIndex)
4529 {
4530 ISD::ArgFlagsTy Flags = Ins[index].Flags;
4531 // FIXME: For now, all byval parameter objects are marked mutable.
4532 // This can be changed with more analysis.
4533 // In case of tail call optimization mark all arguments mutable.
4534 // Since they could be overwritten by lowering of arguments in case of
4535 // a tail call.
4536 if (Flags.isByVal()) {
4537 assert(Ins[index].isOrigArg() &&
4538 "Byval arguments cannot be implicit");
4539 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
4540
4541 int FrameIndex = StoreByValRegs(
4542 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
4543 VA.getLocMemOffset(), Flags.getByValSize());
4544 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
4545 CCInfo.nextInRegsParam();
4546 } else {
4547 unsigned FIOffset = VA.getLocMemOffset();
4548 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
4549 FIOffset, true);
4550
4551 // Create load nodes to retrieve arguments from the stack.
4552 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4553 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
4555 DAG.getMachineFunction(), FI)));
4556 }
4557 lastInsIndex = index;
4558 }
4559 }
4560 }
4561
4562 // varargs
4563 if (isVarArg && MFI.hasVAStart()) {
4564 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset(),
4566 if (AFI->isCmseNSEntryFunction()) {
4569 "secure entry function must not be variadic", dl.getDebugLoc());
4570 DAG.getContext()->diagnose(Diag);
4571 }
4572 }
4573
4574 unsigned StackArgSize = CCInfo.getNextStackOffset();
4576 if (canGuaranteeTCO(CallConv, TailCallOpt)) {
4577 // The only way to guarantee a tail call is if the callee restores its
4578 // argument area, but it must also keep the stack aligned when doing so.
4579 const DataLayout &DL = DAG.getDataLayout();
4580 StackArgSize = alignTo(StackArgSize, DL.getStackAlignment());
4581
4583 }
4585
4586 if (CCInfo.getNextStackOffset() > 0 && AFI->isCmseNSEntryFunction()) {
4589 "secure entry function requires arguments on stack", dl.getDebugLoc());
4590 DAG.getContext()->diagnose(Diag);
4591 }
4592
4593 return Chain;
4594}
4595
4596/// isFloatingPointZero - Return true if this is +0.0.
4599 return CFP->getValueAPF().isPosZero();
4600 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
4601 // Maybe this has already been legalized into the constant pool?
4602 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
4603 SDValue WrapperOp = Op.getOperand(1).getOperand(0);
4605 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
4606 return CFP->getValueAPF().isPosZero();
4607 }
4608 } else if (Op->getOpcode() == ISD::BITCAST &&
4609 Op->getValueType(0) == MVT::f64) {
4610 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
4611 // created by LowerConstantFP().
4612 SDValue BitcastOp = Op->getOperand(0);
4613 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
4614 isNullConstant(BitcastOp->getOperand(0)))
4615 return true;
4616 }
4617 return false;
4618}
4619
4620/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
4621/// the given operands.
4622SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4623 SDValue &ARMcc, SelectionDAG &DAG,
4624 const SDLoc &dl) const {
4625 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4626 unsigned C = RHSC->getZExtValue();
4627 if (!isLegalICmpImmediate((int32_t)C)) {
4628 // Constant does not fit, try adjusting it by one.
4629 switch (CC) {
4630 default: break;
4631 case ISD::SETLT:
4632 case ISD::SETGE:
4633 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
4634 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4635 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4636 }
4637 break;
4638 case ISD::SETULT:
4639 case ISD::SETUGE:
4640 if (C != 0 && isLegalICmpImmediate(C-1)) {
4641 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4642 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4643 }
4644 break;
4645 case ISD::SETLE:
4646 case ISD::SETGT:
4647 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
4648 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4649 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4650 }
4651 break;
4652 case ISD::SETULE:
4653 case ISD::SETUGT:
4654 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
4655 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4656 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4657 }
4658 break;
4659 }
4660 }
4661 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
4662 (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) {
4663 // In ARM and Thumb-2, the compare instructions can shift their second
4664 // operand.
4666 std::swap(LHS, RHS);
4667 }
4668
4669 // Thumb1 has very limited immediate modes, so turning an "and" into a
4670 // shift can save multiple instructions.
4671 //
4672 // If we have (x & C1), and C1 is an appropriate mask, we can transform it
4673 // into "((x << n) >> n)". But that isn't necessarily profitable on its
4674 // own. If it's the operand to an unsigned comparison with an immediate,
4675 // we can eliminate one of the shifts: we transform
4676 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
4677 //
4678 // We avoid transforming cases which aren't profitable due to encoding
4679 // details:
4680 //
4681 // 1. C2 fits into the immediate field of a cmp, and the transformed version
4682 // would not; in that case, we're essentially trading one immediate load for
4683 // another.
4684 // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
4685 // 3. C2 is zero; we have other code for this special case.
4686 //
4687 // FIXME: Figure out profitability for Thumb2; we usually can't save an
4688 // instruction, since the AND is always one instruction anyway, but we could
4689 // use narrow instructions in some cases.
4690 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
4691 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4692 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
4693 !isSignedIntSetCC(CC)) {
4694 unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue();
4695 auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
4696 uint64_t RHSV = RHSC->getZExtValue();
4697 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
4698 unsigned ShiftBits = countLeadingZeros(Mask);
4699 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
4700 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
4701 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
4702 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
4703 }
4704 }
4705 }
4706
4707 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
4708 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
4709 // way a cmp would.
4710 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
4711 // some tweaks to the heuristics for the previous and->shift transform.
4712 // FIXME: Optimize cases where the LHS isn't a shift.
4713 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
4714 isa<ConstantSDNode>(RHS) &&
4715 cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U &&
4716 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4717 cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) {
4718 unsigned ShiftAmt =
4719 cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1;
4720 SDValue Shift = DAG.getNode(ARMISD::LSLS, dl,
4722 LHS.getOperand(0),
4723 DAG.getConstant(ShiftAmt, dl, MVT::i32));
4724 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
4725 Shift.getValue(1), SDValue());
4726 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
4727 return Chain.getValue(1);
4728 }
4729
4730 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
4731
4732 // If the RHS is a constant zero then the V (overflow) flag will never be
4733 // set. This can allow us to simplify GE to PL or LT to MI, which can be
4734 // simpler for other passes (like the peephole optimiser) to deal with.
4735 if (isNullConstant(RHS)) {
4736 switch (CondCode) {
4737 default: break;
4738 case ARMCC::GE:
4739 CondCode = ARMCC::PL;
4740 break;
4741 case ARMCC::LT:
4742 CondCode = ARMCC::MI;
4743 break;
4744 }
4745 }
4746
4748 switch (CondCode) {
4749 default:
4751 break;
4752 case ARMCC::EQ:
4753 case ARMCC::NE:
4754 // Uses only Z Flag
4756 break;
4757 }
4758 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4759 return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
4760}
4761
4762/// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
4763SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
4764 SelectionDAG &DAG, const SDLoc &dl,
4765 bool Signaling) const {
4766 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
4767 SDValue Cmp;
4768 if (!isFloatingPointZero(RHS))
4769 Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP,
4770 dl, MVT::Glue, LHS, RHS);
4771 else
4772 Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0,
4773 dl, MVT::Glue, LHS);
4774 return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
4775}
4776
4777/// duplicateCmp - Glue values can have only one use, so this function
4778/// duplicates a comparison node.
4779SDValue
4780ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
4781 unsigned Opc = Cmp.getOpcode();
4782 SDLoc DL(Cmp);
4783 if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
4784 return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
4785
4786 assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
4787 Cmp = Cmp.getOperand(0);
4788 Opc = Cmp.getOpcode();
4789 if (Opc == ARMISD::CMPFP)
4790 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
4791 else {
4792 assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
4793 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
4794 }
4795 return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
4796}
4797
4798// This function returns three things: the arithmetic computation itself
4799// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
4800// comparison and the condition code define the case in which the arithmetic
4801// computation *does not* overflow.
4802std::pair<SDValue, SDValue>
4803ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
4804 SDValue &ARMcc) const {
4805 assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
4806
4808 SDValue LHS = Op.getOperand(0);
4809 SDValue RHS = Op.getOperand(1);
4810 SDLoc dl(Op);
4811
4812 // FIXME: We are currently always generating CMPs because we don't support
4813 // generating CMN through the backend. This is not as good as the natural
4814 // CMP case because it causes a register dependency and cannot be folded
4815 // later.
4816
4817 switch (Op.getOpcode()) {
4818 default:
4819 llvm_unreachable("Unknown overflow instruction!");
4820 case ISD::SADDO:
4821 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4822 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
4823 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
4824 break;
4825 case ISD::UADDO:
4826 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4827 // We use ADDC here to correspond to its use in LowerUnsignedALUO.
4828 // We do not use it in the USUBO case as Value may not be used.
4829 Value = DAG.getNode(ARMISD::ADDC, dl,
4830 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
4831 .getValue(0);
4832 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
4833 break;
4834 case ISD::SSUBO:
4835 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4836 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4837 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
4838 break;
4839 case ISD::USUBO:
4840 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4841 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4842 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
4843 break;
4844 case ISD::UMULO:
4845 // We generate a UMUL_LOHI and then check if the high word is 0.
4846 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4847 Value = DAG.getNode(ISD::UMUL_LOHI, dl,
4848 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4849 LHS, RHS);
4850 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
4851 DAG.getConstant(0, dl, MVT::i32));
4852 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4853 break;
4854 case ISD::SMULO:
4855 // We generate a SMUL_LOHI and then check if all the bits of the high word
4856 // are the same as the sign bit of the low word.
4857 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4858 Value = DAG.getNode(ISD::SMUL_LOHI, dl,
4859 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4860 LHS, RHS);
4861 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
4862 DAG.getNode(ISD::SRA, dl, Op.getValueType(),
4863 Value.getValue(0),
4864 DAG.getConstant(31, dl, MVT::i32)));
4865 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4866 break;
4867 } // switch (...)
4868
4869 return std::make_pair(Value, OverflowCmp);
4870}
4871
4872SDValue
4873ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
4874 // Let legalize expand this if it isn't a legal type yet.
4875 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4876 return SDValue();
4877
4879 SDValue ARMcc;
4880 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
4881 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4882 SDLoc dl(Op);
4883 // We use 0 and 1 as false and true values.
4884 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4885 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4886 EVT VT = Op.getValueType();
4887
4888 SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,
4890
4891 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4892 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4893}
4894
4896 SelectionDAG &DAG) {
4898 EVT CarryVT = BoolCarry.getValueType();
4899
4900 // This converts the boolean value carry into the carry flag by doing
4901 // ARMISD::SUBC Carry, 1
4904 BoolCarry, DAG.getConstant(1, DL, CarryVT));
4905 return Carry.getValue(1);
4906}
4907
4909 SelectionDAG &DAG) {
4910 SDLoc DL(Flags);
4911
4912 // Now convert the carry flag into a boolean carry. We do this
4913 // using ARMISD:ADDE 0, 0, Carry
4914 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4915 DAG.getConstant(0, DL, MVT::i32),
4916 DAG.getConstant(0, DL, MVT::i32), Flags);
4917}
4918
4919SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
4920 SelectionDAG &DAG) const {
4921 // Let legalize expand this if it isn't a legal type yet.
4922 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4923 return SDValue();
4924
4925 SDValue LHS = Op.getOperand(0);
4926 SDValue RHS = Op.getOperand(1);
4927 SDLoc dl(Op);
4928
4929 EVT VT = Op.getValueType();
4930 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
4931 SDValue Value;
4933 switch (Op.getOpcode()) {
4934 default:
4935 llvm_unreachable("Unknown overflow instruction!");
4936 case ISD::UADDO:
4937 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
4938 // Convert the carry flag into a boolean value.
4939 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4940 break;
4941 case ISD::USUBO: {
4942 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
4943 // Convert the carry flag into a boolean value.
4944 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4945 // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
4946 // value. So compute 1 - C.
4947 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
4948 DAG.getConstant(1, dl, MVT::i32), Overflow);
4949 break;
4950 }
4951 }
4952
4953 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4954}
4955
4957 const ARMSubtarget *Subtarget) {
4958 EVT VT = Op.getValueType();
4959 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
4960 return SDValue();
4961 if (!VT.isSimple())
4962 return SDValue();
4963
4964 unsigned NewOpcode;
4965 switch (VT.getSimpleVT().SimpleTy) {
4966 default:
4967 return SDValue();
4968 case MVT::i8:
4969 switch (Op->getOpcode()) {
4970 case ISD::UADDSAT:
4972 break;
4973 case ISD::SADDSAT:
4975 break;
4976 case ISD::USUBSAT:
4978 break;
4979 case ISD::SSUBSAT:
4981 break;
4982 }
4983 break;
4984 case MVT::i16:
4985 switch (Op->getOpcode()) {
4986 case ISD::UADDSAT:
4988 break;
4989 case ISD::SADDSAT:
4991 break;
4992 case ISD::USUBSAT:
4994 break;
4995 case ISD::SSUBSAT:
4997 break;
4998 }
4999 break;
5000 }
5001
5002 SDLoc dl(Op);
5003 SDValue Add =
5004 DAG.getNode(NewOpcode, dl, MVT::i32,
5005 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
5006 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
5007 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
5008}
5009
5010SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
5011 SDValue Cond = Op.getOperand(0);
5012 SDValue SelectTrue = Op.getOperand(1);
5013 SDValue SelectFalse = Op.getOperand(2);
5014 SDLoc dl(Op);
5015 unsigned Opc = Cond.getOpcode();
5016
5017 if (Cond.getResNo() == 1 &&
5018 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5019 Opc == ISD::USUBO)) {
5020 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
5021 return SDValue();
5022
5024 SDValue ARMcc;
5025 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5026 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5027 EVT VT = Op.getValueType();
5028
5029 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,
5030 OverflowCmp, DAG);
5031 }
5032
5033 // Convert:
5034 //
5035 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
5036 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
5037 //
5038 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
5039 const ConstantSDNode *CMOVTrue =
5040 dyn_cast<ConstantSDNode>(Cond.getOperand(0));
5041 const ConstantSDNode *CMOVFalse =
5042 dyn_cast<ConstantSDNode>(Cond.getOperand(1));
5043
5044 if (CMOVTrue && CMOVFalse) {
5045 unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
5046 unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
5047
5048 SDValue True;
5049 SDValue False;
5050 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
5051 True = SelectTrue;
5052 False = SelectFalse;
5053 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
5054 True = SelectFalse;
5055 False = SelectTrue;
5056 }
5057
5058 if (True.getNode() && False.getNode()) {
5059 EVT VT = Op.getValueType();
5060 SDValue ARMcc = Cond.getOperand(2);
5061 SDValue CCR = Cond.getOperand(3);
5062 SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
5063 assert(True.getValueType() == VT);
5064 return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
5065 }
5066 }
5067 }
5068
5069 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
5070 // undefined bits before doing a full-word comparison with zero.
5071 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
5072 DAG.getConstant(1, dl, Cond.getValueType()));
5073
5074 return DAG.getSelectCC(dl, Cond,
5075 DAG.getConstant(0, dl, Cond.getValueType()),
5077}
5078
5080 bool &swpCmpOps, bool &swpVselOps) {
5081 // Start by selecting the GE condition code for opcodes that return true for
5082 // 'equality'
5083 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
5084 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
5085 CondCode = ARMCC::GE;
5086
5087 // and GT for opcodes that return false for 'equality'.
5088 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
5089 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
5090 CondCode = ARMCC::GT;
5091
5092 // Since we are constrained to GE/GT, if the opcode contains 'less', we need
5093 // to swap the compare operands.
5094 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
5095 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
5096 swpCmpOps = true;
5097
5098 // Both GT and GE are ordered comparisons, and return false for 'unordered'.
5099 // If we have an unordered opcode, we need to swap the operands to the VSEL
5100 // instruction (effectively negating the condition).
5101 //
5102 // This also has the effect of swapping which one of 'less' or 'greater'
5103 // returns true, so we also swap the compare operands. It also switches
5104 // whether we return true for 'equality', so we compensate by picking the
5105 // opposite condition code to our original choice.
5106 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
5107 CC == ISD::SETUGT) {
5110 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
5111 }
5112
5113 // 'ordered' is 'anything but unordered', so use the VS condition code and
5114 // swap the VSEL operands.
5115 if (CC == ISD::SETO) {
5116 CondCode = ARMCC::VS;
5117 swpVselOps = true;
5118 }
5119
5120 // 'unordered or not equal' is 'anything but equal', so use the EQ condition
5121 // code and swap the VSEL operands. Also do this if we don't care about the
5122 // unordered case.
5123 if (CC == ISD::SETUNE || CC == ISD::SETNE) {
5124 CondCode = ARMCC::EQ;
5125 swpVselOps = true;
5126 }
5127}
5128
5129SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
5130 SDValue TrueVal, SDValue ARMcc, SDValue CCR,
5131 SDValue Cmp, SelectionDAG &DAG) const {
5132 if (!Subtarget->hasFP64() && VT == MVT::f64) {
5134 DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
5136 DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
5137
5138 SDValue TrueLow = TrueVal.getValue(0);
5139 SDValue TrueHigh = TrueVal.getValue(1);
5140 SDValue FalseLow = FalseVal.getValue(0);
5141 SDValue FalseHigh = FalseVal.getValue(1);
5142
5144 ARMcc, CCR, Cmp);
5146 ARMcc, CCR, duplicateCmp(Cmp, DAG));
5147
5148 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
5149 } else {
5150 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
5151 Cmp);
5152 }
5153}
5154
5155static bool isGTorGE(ISD::CondCode CC) {
5156 return CC == ISD::SETGT || CC == ISD::SETGE;
5157}
5158
5159static bool isLTorLE(ISD::CondCode CC) {
5160 return CC == ISD::SETLT || CC == ISD::SETLE;
5161}
5162
5163// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
5164// All of these conditions (and their <= and >= counterparts) will do:
5165// x < k ? k : x
5166// x > k ? x : k
5167// k < x ? x : k
5168// k > x ? k : x
5169static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
5170 const SDValue TrueVal, const SDValue FalseVal,
5171 const ISD::CondCode CC, const SDValue K) {
5172 return (isGTorGE(CC) &&
5173 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
5174 (isLTorLE(CC) &&
5175 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
5176}
5177
5178// Check if two chained conditionals could be converted into SSAT or USAT.
5179//
5180// SSAT can replace a set of two conditional selectors that bound a number to an
5181// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
5182//
5183// x < -k ? -k : (x > k ? k : x)
5184// x < -k ? -k : (x < k ? x : k)
5185// x > -k ? (x > k ? k : x) : -k
5186// x < k ? (x < -k ? -k : x) : k
5187// etc.
5188//
5189// LLVM canonicalizes these to either a min(max()) or a max(min())
5190// pattern. This function tries to match one of these and will return a SSAT
5191// node if successful.
5192//
5193// USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1
5194// is a power of 2.
5196 EVT VT = Op.getValueType();
5197 SDValue V1 = Op.getOperand(0);
5198 SDValue K1 = Op.getOperand(1);
5199 SDValue TrueVal1 = Op.getOperand(2);
5200 SDValue FalseVal1 = Op.getOperand(3);
5201 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5202
5204 if (Op2.getOpcode() != ISD::SELECT_CC)
5205 return SDValue();
5206
5207 SDValue V2 = Op2.getOperand(0);
5208 SDValue K2 = Op2.getOperand(1);
5209 SDValue TrueVal2 = Op2.getOperand(2);
5210 SDValue FalseVal2 = Op2.getOperand(3);
5212
5213 SDValue V1Tmp = V1;
5214 SDValue V2Tmp = V2;
5215
5216 // Check that the registers and the constants match a max(min()) or min(max())
5217 // pattern
5218 if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
5219 K2 != FalseVal2 ||
5220 !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
5221 return SDValue();
5222
5223 // Check that the constant in the lower-bound check is
5224 // the opposite of the constant in the upper-bound check
5225 // in 1's complement.
5227 return SDValue();
5228
5229 int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
5230 int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
5231 int64_t PosVal = std::max(Val1, Val2);
5232 int64_t NegVal = std::min(Val1, Val2);
5233
5234 if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
5235 !isPowerOf2_64(PosVal + 1))
5236 return SDValue();
5237
5238 // Handle the difference between USAT (unsigned) and SSAT (signed)
5239 // saturation
5240 // At this point, PosVal is guaranteed to be positive
5241 uint64_t K = PosVal;
5242 SDLoc dl(Op);
5243 if (Val1 == ~Val2)
5244 return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
5245 DAG.getConstant(countTrailingOnes(K), dl, VT));
5246 if (NegVal == 0)
5247 return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
5248 DAG.getConstant(countTrailingOnes(K), dl, VT));
5249
5250 return SDValue();
5251}
5252
5253// Check if a condition of the type x < k ? k : x can be converted into a
5254// bit operation instead of conditional moves.
5255// Currently this is allowed given:
5256// - The conditions and values match up
5257// - k is 0 or -1 (all ones)
5258// This function will not check the last condition, thats up to the caller
5259// It returns true if the transformation can be made, and in such case
5260// returns x in V, and k in SatK.
5262 SDValue &SatK)
5263{
5264 SDValue LHS = Op.getOperand(0);
5265 SDValue RHS = Op.getOperand(1);
5266 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5267 SDValue TrueVal = Op.getOperand(2);
5268 SDValue FalseVal = Op.getOperand(3);
5269
5270 SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
5271 ? &RHS
5272 : nullptr;
5273
5274 // No constant operation in comparison, early out
5275 if (!K)
5276 return false;
5277
5278 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
5279 V = (KTmp == TrueVal) ? FalseVal : TrueVal;
5280 SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
5281
5282 // If the constant on left and right side, or variable on left and right,
5283 // does not match, early out
5284 if (*K != KTmp || V != VTmp)
5285 return false;
5286
5287 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
5288 SatK = *K;
5289 return true;
5290 }
5291
5292 return false;
5293}
5294
5295bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
5296 if (VT == MVT::f32)
5297 return !Subtarget->hasVFP2Base();
5298 if (VT == MVT::f64)
5299 return !Subtarget->hasFP64();
5300 if (VT == MVT::f16)
5301 return !Subtarget->hasFullFP16();
5302 return false;
5303}
5304
5305SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
5306 EVT VT = Op.getValueType();
5307 SDLoc dl(Op);
5308
5309 // Try to convert two saturating conditional selects into a single SSAT
5310 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
5312 return SatValue;
5313
5314 // Try to convert expressions of the form x < k ? k : x (and similar forms)
5315 // into more efficient bit operations, which is possible when k is 0 or -1
5316 // On ARM and Thumb-2 which have flexible operand 2 this will result in
5317 // single instructions. On Thumb the shift and the bit operation will be two
5318 // instructions.
5319 // Only allow this transformation on full-width (32-bit) operations
5322 if (VT == MVT::i32 &&
5324 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
5325 DAG.getConstant(31, dl, VT));
5327 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
5328 DAG.getAllOnesConstant(dl, VT));
5329 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
5331 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
5332 }
5333
5334 SDValue LHS = Op.getOperand(0);
5335 SDValue RHS = Op.getOperand(1);
5336 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5337 SDValue TrueVal = Op.getOperand(2);
5338 SDValue FalseVal = Op.getOperand(3);
5341
5342 if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
5343 LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
5344 unsigned TVal = CTVal->getZExtValue();
5345 unsigned FVal = CFVal->getZExtValue();
5346 unsigned Opcode = 0;
5347
5348 if (TVal == ~FVal) {
5349 Opcode = ARMISD::CSINV;
5350 } else if (TVal == ~FVal + 1) {
5351 Opcode = ARMISD::CSNEG;
5352 } else if (TVal + 1 == FVal) {
5353 Opcode = ARMISD::CSINC;
5354 } else if (TVal == FVal + 1) {
5355 Opcode = ARMISD::CSINC;
5356 std::swap(TrueVal, FalseVal);
5358 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5359 }
5360
5361 if (Opcode) {
5362 // If one of the constants is cheaper than another, materialise the
5363 // cheaper one and let the csel generate the other.
5364 if (Opcode != ARMISD::CSINC &&
5366 std::swap(TrueVal, FalseVal);
5368 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5369 }
5370
5371 // Attempt to use ZR checking TVal is 0, possibly inverting the condition
5372 // to get there. CSINC not is invertable like the other two (~(~a) == a,
5373 // -(-a) == a, but (a+1)+1 != a).
5374 if (FVal == 0 && Opcode != ARMISD::CSINC) {
5375 std::swap(TrueVal, FalseVal);
5377 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5378 }
5379
5380 // Drops F's value because we can get it by inverting/negating TVal.
5381 FalseVal = TrueVal;
5382
5383 SDValue ARMcc;
5384 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5385 EVT VT = TrueVal.getValueType();
5386 return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
5387 }
5388 }
5389
5390 if (isUnsupportedFloatingType(LHS.getValueType())) {
5392 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5393
5394 // If softenSetCCOperands only returned one value, we should compare it to
5395 // zero.
5396 if (!RHS.getNode()) {
5397 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5398 CC = ISD::SETNE;
5399 }
5400 }
5401
5402 if (LHS.getValueType() == MVT::i32) {
5403 // Try to generate VSEL on ARMv8.
5404 // The VSEL instruction can't use all the usual ARM condition
5405 // codes: it only has two bits to select the condition code, so it's
5406 // constrained to use only GE, GT, VS and EQ.
5407 //
5408 // To implement all the various ISD::SETXXX opcodes, we sometimes need to
5409 // swap the operands of the previous compare instruction (effectively
5410 // inverting the compare condition, swapping 'less' and 'greater') and
5411 // sometimes need to swap the operands to the VSEL (which inverts the
5412 // condition in the sense of firing whenever the previous condition didn't)
5413 if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
5414 TrueVal.getValueType() == MVT::f32 ||
5415 TrueVal.getValueType() == MVT::f64)) {
5416 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5417 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
5418 CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
5419 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5420 std::swap(TrueVal, FalseVal);
5421 }
5422 }
5423
5424 SDValue ARMcc;
5425 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5426 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5427 // Choose GE over PL, which vsel does now support
5428 if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL)
5429 ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
5430 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
5431 }
5432
5433 ARMCC::CondCodes CondCode, CondCode2;
5434 FPCCToARMCC(CC, CondCode, CondCode2);
5435
5436 // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
5437 // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
5438 // must use VSEL (limited condition codes), due to not having conditional f16
5439 // moves.
5440 if (Subtarget->hasFPARMv8Base() &&
5441 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
5442 (TrueVal.getValueType() == MVT::f16 ||
5443 TrueVal.getValueType() == MVT::f32 ||
5444 TrueVal.getValueType() == MVT::f64)) {
5445 bool swpCmpOps = false;
5446 bool swpVselOps = false;
5448
5449 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
5450 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
5451 if (swpCmpOps)
5452 std::swap(LHS, RHS);
5453 if (swpVselOps)
5454 std::swap(TrueVal, FalseVal);
5455 }
5456 }
5457
5458 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5459 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5460 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5461 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
5462 if (CondCode2 != ARMCC::AL) {
5464 // FIXME: Needs another CMP because flag can have but one use.
5465 SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
5466 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
5467 }
5468 return Result;
5469}
5470
5471/// canChangeToInt - Given the fp compare operand, return true if it is suitable
5472/// to morph to an integer compare sequence.
5473static bool canChangeToInt(SDValue Op, bool &SeenZero,
5474 const ARMSubtarget *Subtarget) {
5475 SDNode *N = Op.getNode();
5476 if (!N->hasOneUse())
5477 // Otherwise it requires moving the value from fp to integer registers.
5478 return false;
5479 if (!N->getNumValues())
5480 return false;
5481 EVT VT = Op.getValueType();
5482 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
5483 // f32 case is generally profitable. f64 case only makes sense when vcmpe +
5484 // vmrs are very slow, e.g. cortex-a8.
5485 return false;
5486
5487 if (isFloatingPointZero(Op)) {
5488 SeenZero = true;
5489 return true;
5490 }
5491 return ISD::isNormalLoad(N);
5492}
5493
5495 if (isFloatingPointZero(Op))
5496 return DAG.getConstant(0, SDLoc(Op), MVT::i32);
5497
5499 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
5500 Ld->getPointerInfo(), Ld->getAlignment(),
5501 Ld->getMemOperand()->getFlags());
5502
5503 llvm_unreachable("Unknown VFP cmp argument!");
5504}
5505
5508 SDLoc dl(Op);
5509
5510 if (isFloatingPointZero(Op)) {
5511 RetVal1 = DAG.getConstant(0, dl, MVT::i32);
5512 RetVal2 = DAG.getConstant(0, dl, MVT::i32);
5513 return;
5514 }
5515
5516 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
5517 SDValue Ptr = Ld->getBasePtr();
5518 RetVal1 =
5519 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
5520 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
5521
5522 EVT PtrType = Ptr.getValueType();
5523 unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
5524 SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
5525 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
5526 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
5527 Ld->getPointerInfo().getWithOffset(4), NewAlign,
5528 Ld->getMemOperand()->getFlags());
5529 return;
5530 }
5531
5532 llvm_unreachable("Unknown VFP cmp argument!");
5533}
5534
5535/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
5536/// f32 and even f64 comparisons to integer ones.
5537SDValue
5538ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
5539 SDValue Chain = Op.getOperand(0);
5540 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5541 SDValue LHS = Op.getOperand(2);
5542 SDValue RHS = Op.getOperand(3);
5543 SDValue Dest = Op.getOperand(4);
5544 SDLoc dl(Op);
5545
5546 bool LHSSeenZero = false;
5547 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
5548 bool RHSSeenZero = false;
5549 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
5550 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
5551 // If unsafe fp math optimization is enabled and there are no other uses of
5552 // the CMP operands, and the condition code is EQ or NE, we can optimize it
5553 // to an integer comparison.
5554 if (CC == ISD::SETOEQ)
5555 CC = ISD::SETEQ;
5556 else if (CC == ISD::SETUNE)
5557 CC = ISD::SETNE;
5558
5559 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5560 SDValue ARMcc;
5561 if (LHS.getValueType() == MVT::f32) {
5562 LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5563 bitcastf32Toi32(LHS, DAG), Mask);
5564 RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5565 bitcastf32Toi32(RHS, DAG), Mask);
5566 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5567 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5568 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
5569 Chain, Dest, ARMcc, CCR, Cmp);
5570 }
5571
5572 SDValue LHS1, LHS2;
5573 SDValue RHS1, RHS2;
5574 expandf64Toi32(LHS, DAG, LHS1, LHS2);
5575 expandf64Toi32(RHS, DAG, RHS1, RHS2);
5576 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
5577 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
5578 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5579 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5580 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
5581 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
5582 return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
5583 }
5584
5585 return SDValue();
5586}
5587
5588SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
5589 SDValue Chain = Op.getOperand(0);
5590 SDValue Cond = Op.getOperand(1);
5591 SDValue Dest = Op.getOperand(2);
5592 SDLoc dl(Op);
5593
5594 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5595 // instruction.
5596 unsigned Opc = Cond.getOpcode();
5597 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5598 !Subtarget->isThumb1Only();
5599 if (Cond.getResNo() == 1 &&
5600 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5601 Opc == ISD::USUBO || OptimizeMul)) {
5602 // Only lower legal XALUO ops.
5603 if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
5604 return SDValue();
5605
5606 // The actual operation with overflow check.
5608 SDValue ARMcc;
5609 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5610
5611 // Reverse the condition code.
5612 ARMCC::CondCodes CondCode =
5614 CondCode = ARMCC::getOppositeCondition(CondCode);
5615 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5616 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5617
5618 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
5619 OverflowCmp);
5620 }
5621
5622 return SDValue();
5623}
5624
5625SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
5626 SDValue Chain = Op.getOperand(0);
5627 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5628 SDValue LHS = Op.getOperand(2);
5629 SDValue RHS = Op.getOperand(3);
5630 SDValue Dest = Op.getOperand(4);
5631 SDLoc dl(Op);
5632
5633 if (isUnsupportedFloatingType(LHS.getValueType())) {
5635 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5636
5637 // If softenSetCCOperands only returned one value, we should compare it to
5638 // zero.
5639 if (!RHS.getNode()) {
5640 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5641 CC = ISD::SETNE;
5642 }
5643 }
5644
5645 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5646 // instruction.
5647 unsigned Opc = LHS.getOpcode();
5648 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5649 !Subtarget->isThumb1Only();
5650 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
5651 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5652 Opc == ISD::USUBO || OptimizeMul) &&
5653 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5654 // Only lower legal XALUO ops.
5655 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
5656 return SDValue();
5657
5658 // The actual operation with overflow check.
5660 SDValue ARMcc;
5661 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
5662
5663 if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
5664 // Reverse the condition code.
5665 ARMCC::CondCodes CondCode =
5667 CondCode = ARMCC::getOppositeCondition(CondCode);
5668 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5669 }
5670 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5671
5672 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
5673 OverflowCmp);
5674 }
5675
5676 if (LHS.getValueType() == MVT::i32) {
5677 SDValue ARMcc;
5678 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5679 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5680 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
5681 Chain, Dest, ARMcc, CCR, Cmp);
5682 }
5683
5684 if (getTargetMachine().Options.UnsafeFPMath &&
5685 (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
5686 CC == ISD::SETNE || CC == ISD::SETUNE)) {
5687 if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
5688 return Result;
5689 }
5690
5691 ARMCC::CondCodes CondCode, CondCode2;
5692 FPCCToARMCC(CC, CondCode, CondCode2);
5693
5694 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5695 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5696 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5697 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
5698 SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
5699 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
5700 if (CondCode2 != ARMCC::AL) {
5701 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
5702 SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
5703 Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
5704 }
5705 return Res;
5706}
5707
5708SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
5709 SDValue Chain = Op.getOperand(0);
5710 SDValue Table = Op.getOperand(1);
5711 SDValue Index = Op.getOperand(2);
5712 SDLoc dl(Op);
5713
5714 EVT PTy = getPointerTy(DAG.getDataLayout());
5716 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
5717 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
5718 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
5719 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
5720 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
5721 // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table
5722 // which does another jump to the destination. This also makes it easier
5723 // to translate it to TBB / TBH later (Thumb2 only).
5724 // FIXME: This might not work if the function is extremely large.
5725 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
5726 Addr, Op.getOperand(2), JTI);
5727 }
5728 if (isPositionIndependent() || Subtarget->isROPI()) {
5729 Addr =
5730 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
5732 Chain = Addr.getValue(1);
5733 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
5734 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5735 } else {
5736 Addr =
5737 DAG.getLoad(PTy, dl, Chain, Addr,
5739 Chain = Addr.getValue(1);
5740 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5741 }
5742}
5743
5745 EVT VT = Op.getValueType();
5746 SDLoc dl(Op);
5747
5748 if (Op.getValueType().getVectorElementType() == MVT::i32) {
5749 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
5750 return Op;
5751 return DAG.UnrollVectorOp(Op.getNode());
5752 }
5753
5754 const bool HasFullFP16 =
5755 static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();
5756
5757 EVT NewTy;
5758 const EVT OpTy = Op.getOperand(0).getValueType();
5759 if (OpTy == MVT::v4f32)
5760 NewTy = MVT::v4i32;
5761 else if (OpTy == MVT::v4f16 && HasFullFP16)
5762 NewTy = MVT::v4i16;
5763 else if (OpTy == MVT::v8f16 && HasFullFP16)
5764 NewTy = MVT::v8i16;
5765 else
5766 llvm_unreachable("Invalid type for custom lowering!");
5767
5768 if (VT != MVT::v4i16 && VT != MVT::v8i16)
5769 return DAG.UnrollVectorOp(Op.getNode());
5770
5771 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
5772 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
5773}
5774
5775SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
5776 EVT VT = Op.getValueType();
5777 if (VT.isVector())
5778 return LowerVectorFP_TO_INT(Op, DAG);
5779
5780 bool IsStrict = Op->isStrictFPOpcode();
5781 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5782
5783 if (isUnsupportedFloatingType(SrcVal.getValueType())) {
5785 if (Op.getOpcode() == ISD::FP_TO_SINT ||
5786 Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
5787 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
5788 Op.getValueType());
5789 else
5790 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
5791 Op.getValueType());
5792 SDLoc Loc(Op);
5793 MakeLibCallOptions CallOptions;
5794 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
5796 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
5797 CallOptions, Loc, Chain);
5798 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
5799 }
5800
5801 // FIXME: Remove this when we have strict fp instruction selection patterns
5802 if (IsStrict) {
5803 SDLoc Loc(Op);
5804 SDValue Result =
5806 : ISD::FP_TO_UINT,
5807 Loc, Op.getValueType(), SrcVal);
5808 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
5809 }
5810
5811 return Op;
5812}
5813
5815 EVT VT = Op.getValueType();
5816 SDLoc dl(Op);
5817
5818 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
5819 if (VT.getVectorElementType() == MVT::f32)
5820 return Op;
5821 return DAG.UnrollVectorOp(Op.getNode());
5822 }
5823
5824 assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
5825 Op.getOperand(0).getValueType() == MVT::v8i16) &&
5826 "Invalid type for custom lowering!");
5827
5828 const bool HasFullFP16 =
5829 static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();
5830
5832 if (VT == MVT::v4f32)
5834 else if (VT == MVT::v4f16 && HasFullFP16)
5836 else if (VT == MVT::v8f16 && HasFullFP16)
5838 else
5839 return DAG.UnrollVectorOp(Op.getNode());
5840
5841 unsigned CastOpc;
5842 unsigned Opc;
5843 switch (Op.getOpcode()) {
5844 default: llvm_unreachable("Invalid opcode!");
5845 case ISD::SINT_TO_FP:
5847 Opc = ISD::SINT_TO_FP;
5848 break;
5849 case ISD::UINT_TO_FP:
5851 Opc = ISD::UINT_TO_FP;
5852 break;
5853 }
5854
5855 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
5856 return DAG.getNode(Opc, dl, VT, Op);
5857}
5858
5859SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
5860 EVT VT = Op.getValueType();
5861 if (VT.isVector())
5862 return LowerVectorINT_TO_FP(Op, DAG);
5863 if (isUnsupportedFloatingType(VT)) {
5865 if (Op.getOpcode() == ISD::SINT_TO_FP)
5866 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
5867 Op.getValueType());
5868 else
5869 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
5870 Op.getValueType());
5871 MakeLibCallOptions CallOptions;
5872 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
5873 CallOptions, SDLoc(Op)).first;
5874 }
5875
5876 return Op;
5877}
5878
5879SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
5880 // Implement fcopysign with a fabs and a conditional fneg.
5881 SDValue Tmp0 = Op.getOperand(0);
5882 SDValue Tmp1 = Op.getOperand(1);
5883 SDLoc dl(Op);
5884 EVT VT = Op.getValueType();
5885 EVT SrcVT = Tmp1.getValueType();
5886 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
5887 Tmp0.getOpcode() == ARMISD::VMOVDRR;
5888 bool UseNEON = !InGPR && Subtarget->hasNEON();
5889
5890 if (UseNEON) {
5891 // Use VBSL to copy the sign bit.
5892 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
5895 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
5896 if (VT == MVT::f64)
5897 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5898 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
5899 DAG.getConstant(32, dl, MVT::i32));
5900 else /*if (VT == MVT::f32)*/
5902 if (SrcVT == MVT::f32) {
5904 if (VT == MVT::f64)
5905 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5906 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
5907 DAG.getConstant(32, dl, MVT::i32));
5908 } else if (VT == MVT::f32)
5911 DAG.getConstant(32, dl, MVT::i32));
5912 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
5913 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
5914
5915 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
5916 dl, MVT::i32);
5917 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
5918 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
5919 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
5920
5921 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
5922 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
5923 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
5924 if (VT == MVT::f32) {
5925 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
5926 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
5927 DAG.getConstant(0, dl, MVT::i32));
5928 } else {
5929 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
5930 }
5931
5932 return Res;
5933 }
5934
5935 // Bitcast operand 1 to i32.
5936 if (SrcVT == MVT::f64)
5938 Tmp1).getValue(1);
5939 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
5940
5941 // Or in the signbit with integer operations.
5942 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
5943 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5944 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
5945 if (VT == MVT::f32) {
5946 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
5947 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
5948 return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
5949 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
5950 }
5951
5952 // f64: Or the high part with signbit and then combine two parts.
5954 Tmp0);
5955 SDValue Lo = Tmp0.getValue(0);
5956 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
5957 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
5958 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
5959}
5960
5961SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
5963 MachineFrameInfo &MFI = MF.getFrameInfo();
5964 MFI.setReturnAddressIsTaken(true);
5965
5967 return SDValue();
5968
5969 EVT VT = Op.getValueType();
5970 SDLoc dl(Op);
5971 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
5972 if (Depth) {
5973 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
5974 SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
5975 return DAG.getLoad(VT, dl, DAG.getEntryNode(),
5976 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
5978 }
5979
5980 // Return LR, which contains the return address. Mark it an implicit live-in.
5981 unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
5982 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
5983}
5984
5985SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
5986 const ARMBaseRegisterInfo &ARI =
5987 *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
5989 MachineFrameInfo &MFI = MF.getFrameInfo();
5990 MFI.setFrameAddressIsTaken(true);
5991
5992 EVT VT = Op.getValueType();
5993 SDLoc dl(Op); // FIXME probably not meaningful
5994 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
5995 Register FrameReg = ARI.getFrameRegister(MF);
5996 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
5997 while (Depth--)
5998 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
6000 return FrameAddr;
6001}
6002
6003// FIXME? Maybe this could be a TableGen attribute on some registers and
6004// this table could be generated automatically from RegInfo.
6005Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
6006 const MachineFunction &MF) const {
6008 .Case("sp", ARM::SP)
6009 .Default(0);
6010 if (Reg)
6011 return Reg;
6012 report_fatal_error(Twine("Invalid register name \""
6013 + StringRef(RegName) + "\"."));
6014}
6015
6016// Result is 64 bit value so split into two 32 bit values and return as a
6017// pair of values.
6019 SelectionDAG &DAG) {
6020 SDLoc DL(N);
6021
6022 // This function is only supposed to be called for i64 type destination.
6023 assert(N->getValueType(0) == MVT::i64
6024 && "ExpandREAD_REGISTER called for non-i64 type result.");
6025
6028 N->getOperand(0),
6029 N->getOperand(1));
6030
6031 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
6032 Read.getValue(1)));
6033 Results.push_back(Read.getOperand(0));
6034}
6035
6036/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
6037/// When \p DstVT, the destination type of \p BC, is on the vector
6038/// register bank and the source of bitcast, \p Op, operates on the same bank,
6039/// it might be possible to combine them, such that everything stays on the
6040/// vector register bank.
6041/// \p return The node that would replace \p BT, if the combine
6042/// is possible.
6044 SelectionDAG &DAG) {
6045 SDValue Op = BC->getOperand(0);
6046 EVT DstVT = BC->getValueType(0);
6047
6048 // The only vector instruction that can produce a scalar (remember,
6049 // since the bitcast was about to be turned into VMOVDRR, the source
6050 // type is i64) from a vector is EXTRACT_VECTOR_ELT.
6051 // Moreover, we can do this combine only if there is one use.
6052 // Finally, if the destination type is not a vector, there is not
6053 // much point on forcing everything on the vector bank.
6054 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6055 !Op.hasOneUse())
6056 return SDValue();
6057
6058 // If the index is not constant, we will introduce an additional
6059 // multiply that will stick.
6060 // Give up in that case.
6061 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
6062 if (!Index)
6063 return SDValue();
6064 unsigned DstNumElt = DstVT.getVectorNumElements();
6065
6066 // Compute the new index.
6067 const APInt &APIntIndex = Index->getAPIntValue();
6068 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
6070 // Check if the new constant index fits into i32.
6071 if (NewIndex.getBitWidth() > 32)
6072 return SDValue();
6073
6074 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
6075 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
6076 SDLoc dl(Op);
6077 SDValue ExtractSrc = Op.getOperand(0);
6078 EVT VecVT = EVT::getVectorVT(
6079 *DAG.getContext(), DstVT.getScalarType(),
6080 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
6081 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
6083 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
6084}
6085
6086/// ExpandBITCAST - If the target supports VFP, this function is called to
6087/// expand a bit convert where either the source or destination type is i64 to
6088/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
6089/// operand type is illegal (e.g., v2f32 for a target that doesn't support
6090/// vectors), since the legalizer won't know what to do with that.
6091SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
6092 const ARMSubtarget *Subtarget) const {
6093 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6094 SDLoc dl(N);
6095 SDValue Op = N->getOperand(0);
6096
6097 // This function is only supposed to be called for i16 and i64 types, either
6098 // as the source or destination of the bit convert.
6099 EVT SrcVT = Op.getValueType();
6100 EVT DstVT = N->getValueType(0);
6101
6102 if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
6103 (DstVT == MVT::f16 || DstVT == MVT::bf16))
6104 return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
6106
6107 if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
6108 (SrcVT == MVT::f16 || SrcVT == MVT::bf16))
6109 return DAG.getNode(
6111 MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
6112
6113 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
6114 return SDValue();
6115
6116 // Turn i64->f64 into VMOVDRR.
6117 if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
6118 // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
6119 // if we can combine the bitcast with its source.
6121 return Val;
6122
6124 DAG.getConstant(0, dl, MVT::i32));
6126 DAG.getConstant(1, dl, MVT::i32));
6127 return DAG.getNode(ISD::BITCAST, dl, DstVT,
6128 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
6129 }
6130
6131 // Turn f64->i64 into VMOVRRD.
6132 if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
6133 SDValue Cvt;
6134 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
6135 SrcVT.getVectorNumElements() > 1)
6136 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6138 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
6139 else
6140 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6141 DAG.getVTList(MVT::i32, MVT::i32), Op);
6142 // Merge the pieces into a single i64 value.
6143 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
6144 }
6145
6146 return SDValue();
6147}
6148
6149/// getZeroVector - Returns a vector of specified type with all zero elements.
6150/// Zero vectors are used to represent vector negation and in those cases
6151/// will be implemented with the NEON VNEG instruction. However, VNEG does
6152/// not support i64 elements, so sometimes the zero vectors will need to be
6153/// explicitly constructed. Regardless, use a canonical VMOV to create the
6154/// zero vector.
6155static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6156 assert(VT.isVector() && "Expected a vector type");
6157 // The canonical modified immediate encoding of a zero vector is....0!
6161 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6162}
6163
6164/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
6165/// i32 values and take a 2 x i32 value to shift plus a shift amount.
6166SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
6167 SelectionDAG &DAG) const {
6168 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6169 EVT VT = Op.getValueType();
6170 unsigned VTBits = VT.getSizeInBits();
6171 SDLoc dl(Op);
6172 SDValue ShOpLo = Op.getOperand(0);
6173 SDValue ShOpHi = Op.getOperand(1);
6174 SDValue ShAmt = Op.getOperand(2);
6175 SDValue ARMcc;
6176 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
6177 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
6178
6179 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
6180
6182 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6183 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
6184 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6185 DAG.getConstant(VTBits, dl, MVT::i32));
6186 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
6187 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6188 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
6189 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6190 ISD::SETGE, ARMcc, DAG, dl);
6192 ARMcc, CCR, CmpLo);
6193
6194 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
6195 SDValue HiBigShift = Opc == ISD::SRA
6196 ? DAG.getNode(Opc, dl, VT, ShOpHi,
6197 DAG.getConstant(VTBits - 1, dl, VT))
6198 : DAG.getConstant(0, dl, VT);
6199 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6200 ISD::SETGE, ARMcc, DAG, dl);
6202 ARMcc, CCR, CmpHi);
6203
6204 SDValue Ops[2] = { Lo, Hi };
6205 return DAG.getMergeValues(Ops, dl);
6206}
6207
6208/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
6209/// i32 values and take a 2 x i32 value to shift plus a shift amount.
6210SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
6211 SelectionDAG &DAG) const {
6212 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6213 EVT VT = Op.getValueType();
6214 unsigned VTBits = VT.getSizeInBits();
6215 SDLoc dl(Op);
6216 SDValue ShOpLo = Op.getOperand(0);
6217 SDValue ShOpHi = Op.getOperand(1);
6218 SDValue ShAmt = Op.getOperand(2);
6219 SDValue ARMcc;
6220 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
6221
6222 assert(Op.getOpcode() == ISD::SHL_PARTS);
6224 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6225 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
6226 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
6227 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6228
6229 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6230 DAG.getConstant(VTBits, dl, MVT::i32));
6232 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6233 ISD::SETGE, ARMcc, DAG, dl);
6235 ARMcc, CCR, CmpHi);
6236
6237 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6238 ISD::SETGE, ARMcc, DAG, dl);
6239 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
6241 DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo);
6242
6243 SDValue Ops[2] = { Lo, Hi };
6244 return DAG.getMergeValues(Ops, dl);
6245}
6246
6247SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
6248 SelectionDAG &DAG) const {
6249 // The rounding mode is in bits 23:22 of the FPSCR.
6250 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
6251 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
6252 // so that the shift + and get folded into a bitfield extract.
6253 SDLoc dl(Op);
6254 SDValue Chain = Op.getOperand(0);
6255 SDValue Ops[] = {Chain,
6256 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
6257
6258 SDValue FPSCR =
6260 Chain = FPSCR.getValue(1);
6262 DAG.getConstant(1U << 22, dl, MVT::i32));
6264 DAG.getConstant(22, dl, MVT::i32));
6266 DAG.getConstant(3, dl, MVT::i32));
6267 return DAG.getMergeValues({And, Chain}, dl);
6268}
6269
// Lower llvm.set.rounding: compute the ARM encoding of the requested rounding
// mode, read the current FPSCR (arm_get_fpscr), replace bits 23:22, and write
// it back (arm_set_fpscr).
// NOTE(review): several intermediate DAG.getNode lines (original lines
// 6287/6289/6291-6292/6298/6304/6306/6309) are missing from this extraction;
// the surviving constant operands below show the intended computation.
6270SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
6271 SelectionDAG &DAG) const {
6272 SDLoc DL(Op);
6273 SDValue Chain = Op->getOperand(0);
6274 SDValue RMValue = Op->getOperand(1);
6275
6276 // The rounding mode is in bits 23:22 of the FPSCR.
6277 // The llvm.set.rounding argument value to ARM rounding mode value mapping
6278 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
6279 // ((arg - 1) & 3) << 22).
6280 //
6281 // It is expected that the argument of llvm.set.rounding is within the
6282 // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is
6283 // responsibility of the code generated llvm.set.rounding to ensure this
6284 // condition.
6285
6286 // Calculate new value of FPSCR[23:22].
6288 DAG.getConstant(1, DL, MVT::i32));
6290 DAG.getConstant(0x3, DL, MVT::i32));
6293
6294 // Get current value of FPSCR.
6295 SDValue Ops[] = {Chain,
6296 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6297 SDValue FPSCR =
6299 Chain = FPSCR.getValue(1);
6300 FPSCR = FPSCR.getValue(0);
6301
6302 // Put new rounding mode into FPSCR[23:22].
     // RMMask clears the rounding-mode field so the new bits can be OR'd in.
6303 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
6305 DAG.getConstant(RMMask, DL, MVT::i32));
     // Write the merged value back through the chained set-FPSCR intrinsic.
6307 SDValue Ops2[] = {
6308 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6310}
6311
// Lower count-trailing-zeros (CTTZ / CTTZ_ZERO_UNDEF).
// NOTE(review): the signature line (original line 6312, presumably
// "static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,") is missing from
// this extraction, as is line 6337 (the WidthMinus1 declaration).
// Vector path (NEON): isolate the lowest set bit (x & -x) and then either
//   cttz(x) = ctpop(lsb - 1), or
//   cttz(x) = (width - 1) - ctlz(lsb)   (i16/i32, zero-undef only).
// Scalar path (v6T2+): cttz(x) = ctlz(bitreverse(x)) via RBIT.
6313 const ARMSubtarget *ST) {
6314 SDLoc dl(N);
6315 EVT VT = N->getValueType(0);
6316 if (VT.isVector() && ST->hasNEON()) {
6317
6318 // Compute the least significant set bit: LSB = X & -X
6319 SDValue X = N->getOperand(0);
6320 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
6321 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
6322
6323 EVT ElemTy = VT.getVectorElementType();
6324
6325 if (ElemTy == MVT::i8) {
6326 // Compute with: cttz(x) = ctpop(lsb - 1)
6327 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6328 DAG.getTargetConstant(1, dl, ElemTy));
6329 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6330 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6331 }
6332
6333 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
6334 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
6335 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
6336 unsigned NumBits = ElemTy.getSizeInBits();
6338 DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6339 DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
6340 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
6341 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
6342 }
6343
6344 // Compute with: cttz(x) = ctpop(lsb - 1)
6345
6346 // Compute LSB - 1.
6347 SDValue Bits;
6348 if (ElemTy == MVT::i64) {
6349 // Load constant 0xffff'ffff'ffff'ffff to register.
     // 0x1eff is the VMOV modified-immediate encoding of that all-ones value;
     // adding all-ones is equivalent to subtracting 1.
6350 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6351 DAG.getTargetConstant(0x1eff, dl, MVT::i32));
6352 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
6353 } else {
6354 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6355 DAG.getTargetConstant(1, dl, ElemTy));
6356 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6357 }
6358 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6359 }
6360
6361 if (!ST->hasV6T2Ops())
6362 return SDValue();
6363
     // Scalar: cttz(x) == ctlz(bitreverse(x)); BITREVERSE selects to RBIT.
6364 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
6365 return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
6366}
6367
// Lower vector CTPOP for wide element types: popcount each byte with the
// native v8i8/v16i8 CTPOP, then repeatedly widen with pairwise-add-long
// (vpaddlu) until the element size matches the requested type.
// NOTE(review): the signature line (original line 6368, presumably
// "static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,") and lines
// 6379/6387/6394 (VT8Bit, the Ops SmallVector, and WidenVT declarations)
// are missing from this extraction.
6369 const ARMSubtarget *ST) {
6370 EVT VT = N->getValueType(0);
6371 SDLoc DL(N);
6372
6373 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
6374 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
6375 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
6376 "Unexpected type for custom ctpop lowering");
6377
6378 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6380 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
6381 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
6382
6383 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
6384 unsigned EltSize = 8;
6385 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
6386 while (EltSize != VT.getScalarSizeInBits()) {
6388 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
6389 TLI.getPointerTy(DAG.getDataLayout())));
6390 Ops.push_back(Res);
6391
     // Each vpaddlu doubles the element size and halves the element count.
6392 EltSize *= 2;
6393 NumElts /= 2;
6395 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
6396 }
6397
6398 return Res;
6399}
6400
6401/// getVShiftImm - Check if this is a valid build_vector for the immediate
6402/// operand of a vector shift operation, where all the elements of the
6403/// build_vector must have the same constant integer value.
6404static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
6405 // Ignore bit_converts.
6406 while (Op.getOpcode() == ISD::BITCAST)
6407 Op = Op.getOperand(0);
     // NOTE(review): the declaration of BVN (original line 6408, presumably
     // "BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());")
     // is missing from this extraction.
6409 APInt SplatBits, SplatUndef;
6410 unsigned SplatBitSize;
6411 bool HasAnyUndefs;
     // Fail unless the operand is a constant splat no wider than the element.
6412 if (!BVN ||
6413 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
6414 ElementBits) ||
6415 SplatBitSize > ElementBits)
6416 return false;
6417 Cnt = SplatBits.getSExtValue();
6418 return true;
6419}
6420
6421/// isVShiftLImm - Check if this is a valid build_vector for the immediate
6422/// operand of a vector shift left operation. That value must be in the range:
6423/// 0 <= Value < ElementBits for a left shift; or
6424/// 0 <= Value <= ElementBits for a long left shift.
6425static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
6426 assert(VT.isVector() && "vector shift count is not a vector type");
6427 int64_t ElementBits = VT.getScalarSizeInBits();
6428 if (!getVShiftImm(Op, ElementBits, Cnt))
6429 return false;
6430 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
6431}
6432
6433/// isVShiftRImm - Check if this is a valid build_vector for the immediate
6434/// operand of a vector shift right operation. For a shift opcode, the value
6435/// is positive, but for an intrinsic the value count must be negative. The
6436/// absolute value must be in the range:
6437/// 1 <= |Value| <= ElementBits for a right shift; or
6438/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
6439static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
6440 int64_t &Cnt) {
6441 assert(VT.isVector() && "vector shift count is not a vector type");
6442 int64_t ElementBits = VT.getScalarSizeInBits();
6443 if (!getVShiftImm(Op, ElementBits, Cnt))
6444 return false;
6445 if (!isIntrinsic)
6446 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
6447 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
6448 Cnt = -Cnt;
6449 return true;
6450 }
6451 return false;
6452}
6453
// Lower vector SHL/SRA/SRL. Immediate shifts become VSHLIMM/VSHRsIMM/VSHRuIMM;
// register shifts become VSHLs/VSHLu (right shifts via a negated count).
// NOTE(review): the signature line (original line 6454, presumably
// "static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,") and line 6490
// (the NegatedCount declaration) are missing from this extraction.
6455 const ARMSubtarget *ST) {
6456 EVT VT = N->getValueType(0);
6457 SDLoc dl(N);
6458 int64_t Cnt;
6459
6460 if (!VT.isVector())
6461 return SDValue();
6462
6463 // We essentially have two forms here. Shift by an immediate and shift by a
6464 // vector register (there are also shift by a gpr, but that is just handled
6465 // with a tablegen pattern). We cannot easily match shift by an immediate in
6466 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
6467 // For shifting by a vector, we don't have VSHR, only VSHL (which can be
6468 // signed or unsigned, and a negative shift indicates a shift right).
6469 if (N->getOpcode() == ISD::SHL) {
6470 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
6471 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
6472 DAG.getConstant(Cnt, dl, MVT::i32));
6473 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
6474 N->getOperand(1));
6475 }
6476
6477 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
6478 "unexpected vector shift opcode");
6479
6480 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
     // Signedness of the right shift selects the s/u immediate opcode.
6481 unsigned VShiftOpc =
6482 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
6483 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
6484 DAG.getConstant(Cnt, dl, MVT::i32));
6485 }
6486
6487 // Other right shifts we don't have operations for (we use a shift left by a
6488 // negative number).
6489 EVT ShiftVT = N->getOperand(1).getValueType();
6491 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
6492 unsigned VShiftOpc =
6493 (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
6494 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
6495}
6496
// Expand an i64 scalar shift. With MVE, use the two-register LSLL/LSRL/ASRL
// "shift parts" operations; otherwise only shift-by-one SRA/SRL is handled,
// lowered to an SRx_FLAG on the high half plus an RRX on the low half.
// NOTE(review): the signature line (original line 6497, presumably
// "static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,") and lines
// 6514/6534/6536 (the Con dyn_cast and the ShPartsOpc reassignments for
// LSRL/ASRL) are missing from this extraction.
6498 const ARMSubtarget *ST) {
6499 EVT VT = N->getValueType(0);
6500 SDLoc dl(N);
6501
6502 // We can get here for a node like i32 = ISD::SHL i32, i64
6503 if (VT != MVT::i64)
6504 return SDValue();
6505
6506 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
6507 N->getOpcode() == ISD::SHL) &&
6508 "Unknown shift to lower!");
6509
6510 unsigned ShOpc = N->getOpcode();
6511 if (ST->hasMVEIntegerOps()) {
6512 SDValue ShAmt = N->getOperand(1);
6513 unsigned ShPartsOpc = ARMISD::LSLL;
6515
6516 // If the shift amount is greater than 32 or has a greater bitwidth than 64
6517 // then do the default optimisation
6518 if (ShAmt->getValueType(0).getSizeInBits() > 64 ||
6519 (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32)))
6520 return SDValue();
6521
6522 // Extract the lower 32 bits of the shift amount if it's not an i32
6523 if (ShAmt->getValueType(0) != MVT::i32)
6524 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
6525
6526 if (ShOpc == ISD::SRL) {
6527 if (!Con)
6528 // There is no t2LSRLr instruction so negate and perform an lsll if the
6529 // shift amount is in a register, emulating a right shift.
6530 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6531 DAG.getConstant(0, dl, MVT::i32), ShAmt);
6532 else
6533 // Else generate an lsrl on the immediate shift amount
6535 } else if (ShOpc == ISD::SRA)
6537
6538 // Lower 32 bits of the destination/source
6539 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
6540 DAG.getConstant(0, dl, MVT::i32));
6541 // Upper 32 bits of the destination/source
6542 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
6543 DAG.getConstant(1, dl, MVT::i32));
6544
6545 // Generate the shift operation as computed above
6546 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
6547 ShAmt);
6548 // The upper 32 bits come from the second return value of lsll
6549 Hi = SDValue(Lo.getNode(), 1);
6550 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6551 }
6552
6553 // We only lower SRA, SRL of 1 here, all others use generic lowering.
6554 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
6555 return SDValue();
6556
6557 // If we are in thumb mode, we don't have RRX.
6558 if (ST->isThumb1Only())
6559 return SDValue();
6560
6561 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
6562 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
6563 DAG.getConstant(0, dl, MVT::i32));
6564 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
6565 DAG.getConstant(1, dl, MVT::i32));
6566
6567 // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
6568 // captures the result into a carry flag.
6569 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
6570 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);
6571
6572 // The low part is an ARMISD::RRX operand, which shifts the carry in.
6573 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
6574
6575 // Merge the pieces into a single i64 value.
6576 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6577}
6578
// Lower a vector SETCC to ARM vector compares (VCMP/VCMPZ/VTST), canonicalizing
// the condition with optional operand swap and/or result inversion, for both
// NEON and MVE.
// NOTE(review): the signature line (original line 6579, presumably
// "static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,") is missing from
// this extraction, along with lines 6589/6594/6610-6611/6615/6618/6706/6731-6732
// (the SetCCOpcode extraction, the NEON CmpVT assignment, the 64-bit-equality
// guard and SplitVT, the Cmp setcc, the all-zeros test on Op1, and the SingleOp
// declaration/all-zeros test on Op1).
6580 const ARMSubtarget *ST) {
6581 bool Invert = false;
6582 bool Swap = false;
6583 unsigned Opc = ARMCC::AL;
6584
6585 SDValue Op0 = Op.getOperand(0);
6586 SDValue Op1 = Op.getOperand(1);
6587 SDValue CC = Op.getOperand(2);
6588 EVT VT = Op.getValueType();
6590 SDLoc dl(Op);
6591
6592 EVT CmpVT;
6593 if (ST->hasNEON())
6595 else {
6596 assert(ST->hasMVEIntegerOps() &&
6597 "No hardware support for integer vector comparison!");
6598
6599 if (Op.getValueType().getVectorElementType() != MVT::i1)
6600 return SDValue();
6601
6602 // Make sure we expand floating point setcc to scalar if we do not have
6603 // mve.fp, so that we can handle them from there.
6604 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
6605 return SDValue();
6606
6607 CmpVT = VT;
6608 }
6609
6612 // Special-case integer 64-bit equality comparisons. They aren't legal,
6613 // but they can be lowered with a few vector instructions.
     // Compare as twice-as-many 32-bit lanes, then AND each lane with its
     // VREV64-swapped partner so both halves of each i64 must match.
6614 unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
6616 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
6617 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
6619 DAG.getCondCode(ISD::SETEQ));
6620 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
6621 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
6622 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
6623 if (SetCCOpcode == ISD::SETNE)
6624 Merged = DAG.getNOT(dl, Merged, CmpVT);
6625 Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
6626 return Merged;
6627 }
6628
6629 if (CmpVT.getVectorElementType() == MVT::i64)
6630 // 64-bit comparisons are not legal in general.
6631 return SDValue();
6632
6633 if (Op1.getValueType().isFloatingPoint()) {
     // Map the FP condition onto an ARM condition code; unordered conditions
     // are realized by inverting the complementary ordered compare.
6634 switch (SetCCOpcode) {
6635 default: llvm_unreachable("Illegal FP comparison");
6636 case ISD::SETUNE:
6637 case ISD::SETNE:
6638 if (ST->hasMVEFloatOps()) {
6639 Opc = ARMCC::NE; break;
6640 } else {
6641 Invert = true; LLVM_FALLTHROUGH;
6642 }
6643 case ISD::SETOEQ:
6644 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6645 case ISD::SETOLT:
6646 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
6647 case ISD::SETOGT:
6648 case ISD::SETGT: Opc = ARMCC::GT; break;
6649 case ISD::SETOLE:
6650 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH;
6651 case ISD::SETOGE:
6652 case ISD::SETGE: Opc = ARMCC::GE; break;
6653 case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
6654 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
6655 case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH;
6656 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
6657 case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH;
6658 case ISD::SETONE: {
6659 // Expand this to (OLT | OGT).
6660 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6661 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6662 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6663 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6664 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6665 if (Invert)
6666 Result = DAG.getNOT(dl, Result, VT);
6667 return Result;
6668 }
6669 case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH;
6670 case ISD::SETO: {
6671 // Expand this to (OLT | OGE).
6672 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6673 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6674 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6675 DAG.getConstant(ARMCC::GE, dl, MVT::i32));
6676 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6677 if (Invert)
6678 Result = DAG.getNOT(dl, Result, VT);
6679 return Result;
6680 }
6681 }
6682 } else {
6683 // Integer comparisons.
6684 switch (SetCCOpcode) {
6685 default: llvm_unreachable("Illegal integer comparison");
6686 case ISD::SETNE:
6687 if (ST->hasMVEIntegerOps()) {
6688 Opc = ARMCC::NE; break;
6689 } else {
6690 Invert = true; LLVM_FALLTHROUGH;
6691 }
6692 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6693 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
6694 case ISD::SETGT: Opc = ARMCC::GT; break;
6695 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH;
6696 case ISD::SETGE: Opc = ARMCC::GE; break;
6697 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
6698 case ISD::SETUGT: Opc = ARMCC::HI; break;
6699 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
6700 case ISD::SETUGE: Opc = ARMCC::HS; break;
6701 }
6702
6703 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
6704 if (ST->hasNEON() && Opc == ARMCC::EQ) {
6705 SDValue AndOp;
6707 AndOp = Op0;
6708 else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
6709 AndOp = Op1;
6710
6711 // Ignore bitconvert.
6712 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
6713 AndOp = AndOp.getOperand(0);
6714
6715 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
6716 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
6717 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
6718 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
6719 if (!Invert)
6720 Result = DAG.getNOT(dl, Result, VT);
6721 return Result;
6722 }
6723 }
6724 }
6725
6726 if (Swap)
6727 std::swap(Op0, Op1);
6728
6729 // If one of the operands is a constant vector zero, attempt to fold the
6730 // comparison to a specialized compare-against-zero form.
6733 SingleOp = Op0;
6734 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
6735 if (Opc == ARMCC::GE)
6736 Opc = ARMCC::LE;
6737 else if (Opc == ARMCC::GT)
6738 Opc = ARMCC::LT;
6739 SingleOp = Op1;
6740 }
6741
6742 SDValue Result;
6743 if (SingleOp.getNode()) {
6744 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp,
6745 DAG.getConstant(Opc, dl, MVT::i32));
6746 } else {
6747 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6748 DAG.getConstant(Opc, dl, MVT::i32));
6749 }
6750
6751 Result = DAG.getSExtOrTrunc(Result, dl, VT);
6752
6753 if (Invert)
6754 Result = DAG.getNOT(dl, Result, VT);
6755
6756 return Result;
6757}
6758
// Lower SETCCCARRY: compare (LHS - RHS - borrow) using ARMISD::SUBE and select
// 0/1 via CMOV on the resulting flags.
// NOTE(review): the signature line (original line 6759, presumably
// "static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {") and
// lines 6770/6773/6780-6781 (the carry inversion/conversion nodes and the
// CondCode-to-ARMcc materialization) are missing from this extraction.
6760 SDValue LHS = Op.getOperand(0);
6761 SDValue RHS = Op.getOperand(1);
6762 SDValue Carry = Op.getOperand(2);
6763 SDValue Cond = Op.getOperand(3);
6764 SDLoc DL(Op);
6765
6766 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6767
6768 // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
6769 // have to invert the carry first.
6771 DAG.getConstant(1, DL, MVT::i32), Carry);
6772 // This converts the boolean value carry into the carry flag.
6774
6775 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
6776 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
6777
6778 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
6779 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
6782 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
     // Route the compare's flag result into CPSR for the CMOV.
6783 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
6784 Cmp.getValue(1), SDValue());
6785 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
6786 CCR, Chain.getValue(1));
6787}
6788
6789/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
6790/// valid vector constant for a NEON or MVE instruction with a "modified
6791/// immediate" operand (e.g., VMOV). If so, return the encoded value.
/// On success the chosen OpCmode/Imm pair is packed into an i32 target
/// constant; otherwise an empty SDValue is returned.
/// NOTE(review): the assignments of the output VT for each splat size
/// (original lines 6815/6820/6840/6928) and the final EncodedVal computation
/// (line 6936, presumably ARM_AM::createVMOVModImm(OpCmode, Imm)) are missing
/// from this extraction.
6792static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
6793 unsigned SplatBitSize, SelectionDAG &DAG,
6794 const SDLoc &dl, EVT &VT, EVT VectorVT,
6795 VMOVModImmType type) {
6796 unsigned OpCmode, Imm;
6797 bool is128Bits = VectorVT.is128BitVector();
6798
6799 // SplatBitSize is set to the smallest size that splats the vector, so a
6800 // zero vector will always have SplatBitSize == 8. However, NEON modified
6801 // immediate instructions others than VMOV do not support the 8-bit encoding
6802 // of a zero vector, and the default encoding of zero is supposed to be the
6803 // 32-bit version.
6804 if (SplatBits == 0)
6805 SplatBitSize = 32;
6806
6807 switch (SplatBitSize) {
6808 case 8:
6809 if (type != VMOVModImm)
6810 return SDValue();
6811 // Any 1-byte value is OK. Op=0, Cmode=1110.
6812 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
6813 OpCmode = 0xe;
6814 Imm = SplatBits;
6816 break;
6817
6818 case 16:
6819 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
6821 if ((SplatBits & ~0xff) == 0) {
6822 // Value = 0x00nn: Op=x, Cmode=100x.
6823 OpCmode = 0x8;
6824 Imm = SplatBits;
6825 break;
6826 }
6827 if ((SplatBits & ~0xff00) == 0) {
6828 // Value = 0xnn00: Op=x, Cmode=101x.
6829 OpCmode = 0xa;
6830 Imm = SplatBits >> 8;
6831 break;
6832 }
6833 return SDValue();
6834
6835 case 32:
6836 // NEON's 32-bit VMOV supports splat values where:
6837 // * only one byte is nonzero, or
6838 // * the least significant byte is 0xff and the second byte is nonzero, or
6839 // * the least significant 2 bytes are 0xff and the third is nonzero.
6841 if ((SplatBits & ~0xff) == 0) {
6842 // Value = 0x000000nn: Op=x, Cmode=000x.
6843 OpCmode = 0;
6844 Imm = SplatBits;
6845 break;
6846 }
6847 if ((SplatBits & ~0xff00) == 0) {
6848 // Value = 0x0000nn00: Op=x, Cmode=001x.
6849 OpCmode = 0x2;
6850 Imm = SplatBits >> 8;
6851 break;
6852 }
6853 if ((SplatBits & ~0xff0000) == 0) {
6854 // Value = 0x00nn0000: Op=x, Cmode=010x.
6855 OpCmode = 0x4;
6856 Imm = SplatBits >> 16;
6857 break;
6858 }
6859 if ((SplatBits & ~0xff000000) == 0) {
6860 // Value = 0xnn000000: Op=x, Cmode=011x.
6861 OpCmode = 0x6;
6862 Imm = SplatBits >> 24;
6863 break;
6864 }
6865
6866 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
6867 if (type == OtherModImm) return SDValue();
6868
6869 if ((SplatBits & ~0xffff) == 0 &&
6870 ((SplatBits | SplatUndef) & 0xff) == 0xff) {
6871 // Value = 0x0000nnff: Op=x, Cmode=1100.
6872 OpCmode = 0xc;
6873 Imm = SplatBits >> 8;
6874 break;
6875 }
6876
6877 // cmode == 0b1101 is not supported for MVE VMVN
6878 if (type == MVEVMVNModImm)
6879 return SDValue();
6880
6881 if ((SplatBits & ~0xffffff) == 0 &&
6882 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
6883 // Value = 0x00nnffff: Op=x, Cmode=1101.
6884 OpCmode = 0xd;
6885 Imm = SplatBits >> 16;
6886 break;
6887 }
6888
6889 // Note: there are a few 32-bit splat values (specifically: 00ffff00,
6890 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
6891 // VMOV.I32. A (very) minor optimization would be to replicate the value
6892 // and fall through here to test for a valid 64-bit splat. But, then the
6893 // caller would also need to check and handle the change in size.
6894 return SDValue();
6895
6896 case 64: {
6897 if (type != VMOVModImm)
6898 return SDValue();
6899 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
     // Build one immediate bit per all-ones byte; any byte that is neither
     // all-zero nor all-ones (modulo undef bits) is not encodable.
6900 uint64_t BitMask = 0xff;
6901 unsigned ImmMask = 1;
6902 Imm = 0;
6903 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
6904 if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
6905 Imm |= ImmMask;
6906 } else if ((SplatBits & BitMask) != 0) {
6907 return SDValue();
6908 }
6909 BitMask <<= 8;
6910 ImmMask <<= 1;
6911 }
6912
6913 if (DAG.getDataLayout().isBigEndian()) {
6914 // Reverse the order of elements within the vector.
6915 unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8;
6916 unsigned Mask = (1 << BytesPerElem) - 1;
6917 unsigned NumElems = 8 / BytesPerElem;
6918 unsigned NewImm = 0;
6919 for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) {
6920 unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask);
6921 NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem;
6922 }
6923 Imm = NewImm;
6924 }
6925
6926 // Op=1, Cmode=1110.
6927 OpCmode = 0x1e;
6929 break;
6930 }
6931
6932 default:
6933 llvm_unreachable("unexpected size for isVMOVModifiedImm");
6934 }
6935
6937 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
6938}
6939
// Lower an FP constant. Order of attempts: execute-only integer
// materialization, VMOV.f32/f64 encoded immediate, then (NEON only)
// VMOV.i32 / VMVN.i32 modified-immediate splats with a bitcast+extract for
// the f32 case. Returns SDValue() to fall back to constant-pool lowering.
// NOTE(review): the declaration of CFP (original line 6944, presumably
// "const ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);") and several
// vector-constant construction lines (6993/6995, 7018/7021/7024/7026,
// 7035/7038/7041/7043) are missing from this extraction.
6940SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
6941 const ARMSubtarget *ST) const {
6942 EVT VT = Op.getValueType();
6943 bool IsDouble = (VT == MVT::f64);
6945 const APFloat &FPVal = CFP->getValueAPF();
6946
6947 // Prevent floating-point constants from using literal loads
6948 // when execute-only is enabled.
6949 if (ST->genExecuteOnly()) {
6950 // If we can represent the constant as an immediate, don't lower it
6951 if (isFPImmLegal(FPVal, VT))
6952 return Op;
6953 // Otherwise, construct as integer, and move to float register
6954 APInt INTVal = FPVal.bitcastToAPInt();
6955 SDLoc DL(CFP);
6956 switch (VT.getSimpleVT().SimpleTy) {
6957 default:
6958 llvm_unreachable("Unknown floating point type!");
6959 break;
6960 case MVT::f64: {
6961 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
6962 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
6963 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
6964 }
6965 case MVT::f32:
6966 return DAG.getNode(ARMISD::VMOVSR, DL, VT,
6967 DAG.getConstant(INTVal, DL, MVT::i32));
6968 }
6969 }
6970
6971 if (!ST->hasVFP3Base())
6972 return SDValue();
6973
6974 // Use the default (constant pool) lowering for double constants when we have
6975 // an SP-only FPU
6976 if (IsDouble && !Subtarget->hasFP64())
6977 return SDValue();
6978
6979 // Try splatting with a VMOV.f32...
6980 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
6981
6982 if (ImmVal != -1) {
6983 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
6984 // We have code in place to select a valid ConstantFP already, no need to
6985 // do any mangling.
6986 return Op;
6987 }
6988
6989 // It's a float and we are trying to use NEON operations where
6990 // possible. Lower it to a splat followed by an extract.
6991 SDLoc DL(Op);
6992 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
6994 NewVal);
6996 DAG.getConstant(0, DL, MVT::i32));
6997 }
6998
6999 // The rest of our options are NEON only, make sure that's allowed before
7000 // proceeding..
7001 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
7002 return SDValue();
7003
7004 EVT VMovVT;
7005 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
7006
7007 // It wouldn't really be worth bothering for doubles except for one very
7008 // important value, which does happen to match: 0.0. So make sure we don't do
7009 // anything stupid.
     // A double is only splat-encodable here if both 32-bit halves are equal.
7010 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
7011 return SDValue();
7012
7013 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
7014 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
7015 VMovVT, VT, VMOVModImm);
7016 if (NewVal != SDValue()) {
7017 SDLoc DL(Op);
7019 NewVal);
7020 if (IsDouble)
7022
7023 // It's a float: cast and extract a vector element.
7025 VecConstant);
7027 DAG.getConstant(0, DL, MVT::i32));
7028 }
7029
7030 // Finally, try a VMVN.i32
7031 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
7032 VT, VMVNModImm);
7033 if (NewVal != SDValue()) {
7034 SDLoc DL(Op);
7036
7037 if (IsDouble)
7039
7040 // It's a float: cast and extract a vector element.
7042 VecConstant);
7044 DAG.getConstant(0, DL, MVT::i32));
7045 }
7046
7047 return SDValue();
7048}
7049
7050// check if an VEXT instruction can handle the shuffle mask when the
7051// vector sources of the shuffle are the same.
7052static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
7053 unsigned NumElts = VT.getVectorNumElements();
7054
7055 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7056 if (M[0] < 0)
7057 return false;
7058
7059 Imm = M[0];
7060
7061 // If this is a VEXT shuffle, the immediate value is the index of the first
7062 // element. The other shuffle indices must be the successive elements after
7063 // the first one.
7064 unsigned ExpectedElt = Imm;
7065 for (unsigned i = 1; i < NumElts; ++i) {
7066 // Increment the expected index. If it wraps around, just follow it
7067 // back to index zero and keep going.
7068 ++ExpectedElt;
7069 if (ExpectedElt == NumElts)
7070 ExpectedElt = 0;
7071
7072 if (M[i] < 0) continue; // ignore UNDEF indices
7073 if (ExpectedElt != static_cast<unsigned>(M[i]))
7074 return false;
7075 }
7076
7077 return true;
7078}
7079
7080static bool isVEXTMask(ArrayRef<int> M, EVT VT,
7081 bool &ReverseVEXT, unsigned &Imm) {
7082 unsigned NumElts = VT.getVectorNumElements();
7083 ReverseVEXT = false;
7084
7085 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7086 if (M[0] < 0)
7087 return false;
7088
7089 Imm = M[0];
7090
7091 // If this is a VEXT shuffle, the immediate value is the index of the first
7092 // element. The other shuffle indices must be the successive elements after
7093 // the first one.
7094 unsigned ExpectedElt = Imm;
7095 for (unsigned i = 1; i < NumElts; ++i) {
7096 // Increment the expected index. If it wraps around, it may still be
7097 // a VEXT but the source vectors must be swapped.
7098 ExpectedElt += 1;
7099 if (ExpectedElt == NumElts * 2) {
7100 ExpectedElt = 0;
7101 ReverseVEXT = true;
7102 }
7103
7104 if (M[i] < 0) continue; // ignore UNDEF indices
7105 if (ExpectedElt != static_cast<unsigned>(M[i]))
7106 return false;
7107 }
7108
7109 // Adjust the index value if the source operands will be swapped.
7110 if (ReverseVEXT)
7111 Imm -= NumElts;
7112
7113 return true;
7114}
7115
7116static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
7117 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
7118 // range, then 0 is placed into the resulting vector. So pretty much any mask
7119 // of 8 elements can work here.
7120 return VT == MVT::v8i8 && M.size() == 8;
7121}
7122
7123static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
7124 unsigned Index) {
7125 if (Mask.size() == Elements * 2)
7126 return Index / Elements;
7127 return Mask[Index] == 0 ? 0 : 1;
7128}
7129
7130// Checks whether the shuffle mask represents a vector transpose (VTRN) by
7131// checking that pairs of elements in the shuffle mask represent the same index
7132// in each vector, incrementing the expected index by 2 at each step.
7133// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
7134// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
7135// v2={e,f,g,h}
7136// WhichResult gives the offset for each element in the mask based on which
7137// of the two results it belongs to.
7138//
7139// The transpose can be represented either as:
7140// result1 = shufflevector v1, v2, result1_shuffle_mask
7141// result2 = shufflevector v1, v2, result2_shuffle_mask
7142// where v1/v2 and the shuffle masks have the same number of elements
7143// (here WhichResult (see below) indicates which result is being checked)
7144//
7145// or as:
7146// results = shufflevector v1, v2, shuffle_mask
7147// where both results are returned in one vector and the shuffle mask has twice
7148// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we
7149// want to check the low half and high half of the shuffle mask as if it were
7150// the other case
7151static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7152 unsigned EltSz = VT.getScalarSizeInBits();
7153 if (EltSz == 64)
7154 return false;
7155
7156 unsigned NumElts = VT.getVectorNumElements();
7157 if (M.size() != NumElts && M.size() != NumElts*2)
7158 return false;
7159
7160 // If the mask is twice as long as the input vector then we need to check the
7161 // upper and lower parts of the mask with a matching value for WhichResult
7162 // FIXME: A mask with only even values will be rejected in case the first
7163 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
7164 // M[0] is used to determine WhichResult
7165 for (unsigned i = 0; i < M.size(); i += NumElts) {
     // NOTE(review): the WhichResult assignment (original line 7166, presumably
     // "WhichResult = SelectPairHalf(NumElts, M, i);") is missing from this
     // extraction.
7167 for (unsigned j = 0; j < NumElts; j += 2) {
7168 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7169 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
7170 return false;
7171 }
7172 }
7173
7174 if (M.size() == NumElts*2)
7175 WhichResult = 0;
7176
7177 return true;
7178}
7179
7180/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
7181/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7182/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
7183static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7184 unsigned EltSz = VT.getScalarSizeInBits();
7185 if (EltSz == 64)
7186 return false;
7187
7188 unsigned NumElts = VT.getVectorNumElements();
7189 if (M.size() != NumElts && M.size() != NumElts*2)
7190 return false;
7191
7192 for (unsigned i = 0; i < M.size(); i += NumElts) {
     // NOTE(review): the WhichResult assignment (original line 7193, presumably
     // "WhichResult = SelectPairHalf(NumElts, M, i);") is missing from this
     // extraction.
7194 for (unsigned j = 0; j < NumElts; j += 2) {
     // Both sources are the same vector, so both elements of each pair must
     // reference the same source index.
7195 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7196 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
7197 return false;
7198 }
7199 }
7200
7201 if (M.size() == NumElts*2)
7202 WhichResult = 0;
7203
7204 return true;
7205}
7206
7207// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
7208// that the mask elements are either all even and in steps of size 2 or all odd
7209// and in steps of size 2.
7210// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
7211// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
7212// v2={e,f,g,h}
7213// Requires similar checks to that of isVTRNMask with
7214// respect the how results are returned.
7215static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7216 unsigned EltSz = VT.getScalarSizeInBits();
7217 if (EltSz == 64)
7218 return false;
7219
7220 unsigned NumElts = VT.getVectorNumElements();
7221 if (M.size() != NumElts && M.size() != NumElts*2)
7222 return false;
7223
7224 for (unsigned i = 0; i < M.size(); i += NumElts) {
     // NOTE(review): the WhichResult assignment (original line 7225, presumably
     // "WhichResult = SelectPairHalf(NumElts, M, i);") is missing from this
     // extraction.
7226 for (unsigned j = 0; j < NumElts; ++j) {
7227 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
7228 return false;
7229 }
7230 }
7231
7232 if (M.size() == NumElts*2)
7233 WhichResult = 0;
7234
7235 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7236 if (VT.is64BitVector() && EltSz == 32)
7237 return false;
7238
7239 return true;
7240}
7241
7242/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
7243/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7244/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
7245static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7246 unsigned EltSz = VT.getScalarSizeInBits();
7247 if (EltSz == 64)
7248 return false;
7249
7250 unsigned NumElts = VT.getVectorNumElements();
7251 if (M.size() != NumElts && M.size() != NumElts*2)
7252 return false;
7253
7254 unsigned Half = NumElts / 2;
7255 for (unsigned i = 0; i < M.size(); i += NumElts) {
7257 for (unsigned j = 0; j < NumElts; j += Half) {
7258 unsigned Idx = WhichResult;
7259 for (unsigned k = 0; k < Half; ++k) {
7260 int MIdx = M[i + j + k];
7261 if (MIdx >= 0 && (unsigned) MIdx != Idx)
7262 return false;
7263 Idx += 2;
7264 }
7265 }
7266 }
7267
7268 if (M.size() == NumElts*2)
7269 WhichResult = 0;
7270
7271 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7272 if (VT.is64BitVector() && EltSz == 32)
7273 return false;
7274
7275 return true;
7276}
7277
7278// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
7279// that pairs of elements of the shufflemask represent the same index in each
7280// vector incrementing sequentially through the vectors.
7281// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
7282// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
7283// v2={e,f,g,h}
7284// Requires similar checks to that of isVTRNMask with respect the how results
7285// are returned.
7286static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7287 unsigned EltSz = VT.getScalarSizeInBits();
7288 if (EltSz == 64)
7289 return false;
7290
7291 unsigned NumElts = VT.getVectorNumElements();
7292 if (M.size() != NumElts && M.size() != NumElts*2)
7293 return false;
7294
7295 for (unsigned i = 0; i < M.size(); i += NumElts) {
7297 unsigned Idx = WhichResult * NumElts / 2;
7298 for (unsigned j = 0; j < NumElts; j += 2) {
7299 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7300 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
7301 return false;
7302 Idx += 1;
7303 }
7304 }
7305
7306 if (M.size() == NumElts*2)
7307 WhichResult = 0;
7308
7309 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7310 if (VT.is64BitVector() && EltSz == 32)
7311 return false;
7312
7313 return true;
7314}
7315
7316/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
7317/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7318/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
7319static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7320 unsigned EltSz = VT.getScalarSizeInBits();
7321 if (EltSz == 64)
7322 return false;
7323
7324 unsigned NumElts = VT.getVectorNumElements();
7325 if (M.size() != NumElts && M.size() != NumElts*2)
7326 return false;
7327
7328 for (unsigned i = 0; i < M.size(); i += NumElts) {
7330 unsigned Idx = WhichResult * NumElts / 2;
7331 for (unsigned j = 0; j < NumElts; j += 2) {
7332 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7333 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
7334 return false;
7335 Idx += 1;
7336 }
7337 }
7338
7339 if (M.size() == NumElts*2)
7340 WhichResult = 0;
7341
7342 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7343 if (VT.is64BitVector() && EltSz == 32)
7344 return false;
7345
7346 return true;
7347}
7348
7349/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
7350/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
7351static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
7352 unsigned &WhichResult,
7353 bool &isV_UNDEF) {
7354 isV_UNDEF = false;
7355 if (isVTRNMask(ShuffleMask, VT, WhichResult))
7356 return ARMISD::VTRN;
7357 if (isVUZPMask(ShuffleMask, VT, WhichResult))
7358 return ARMISD::VUZP;
7359 if (isVZIPMask(ShuffleMask, VT, WhichResult))
7360 return ARMISD::VZIP;
7361
7362 isV_UNDEF = true;
7363 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
7364 return ARMISD::VTRN;
7365 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7366 return ARMISD::VUZP;
7367 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7368 return ARMISD::VZIP;
7369
7370 return 0;
7371}
7372
7373/// \return true if this is a reverse operation on an vector.
7374static bool isReverseMask(ArrayRef<int> M, EVT VT) {
7375 unsigned NumElts = VT.getVectorNumElements();
7376 // Make sure the mask has the right size.
7377 if (NumElts != M.size())
7378 return false;
7379
7380 // Look for <15, ..., 3, -1, 1, 0>.
7381 for (unsigned i = 0; i != NumElts; ++i)
7382 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
7383 return false;
7384
7385 return true;
7386}
7387
7388static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7389 unsigned NumElts = VT.getVectorNumElements();
7390 // Make sure the mask has the right size.
7391 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7392 return false;
7393
7394 // If Top
7395 // Look for <0, N, 2, N+2, 4, N+4, ..>.
7396 // This inserts Input2 into Input1
7397 // else if not Top
7398 // Look for <0, N+1, 2, N+3, 4, N+5, ..>
7399 // This inserts Input1 into Input2
7400 unsigned Offset = Top ? 0 : 1;
7401 unsigned N = SingleSource ? 0 : NumElts;
7402 for (unsigned i = 0; i < NumElts; i += 2) {
7403 if (M[i] >= 0 && M[i] != (int)i)
7404 return false;
7405 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
7406 return false;
7407 }
7408
7409 return true;
7410}
7411
// NOTE(review): the opening signature line of this helper (embedded line 7412)
// is elided from this extract, so M, ToVT and rev are parameters whose exact
// declarations are not visible here -- confirm against the complete file.
7413 unsigned NumElts = ToVT.getVectorNumElements();
7414 if (NumElts != M.size())
7415 return false;
7416
7417 // Test if the Trunc can be convertable to a VMOVN with this shuffle. We are
7418 // looking for patterns of:
7419 // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
7420 // rev: N/2 0 N/2+1 1 N/2+2 2 ...
7421
7422 unsigned Off0 = rev ? NumElts / 2 : 0;
7423 unsigned Off1 = rev ? 0 : NumElts / 2;
7424 for (unsigned i = 0; i < NumElts; i += 2) {
// Even result lanes walk up from Off0, odd lanes from Off1; negative
// (undef) mask entries are accepted anywhere.
7425 if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
7426 return false;
7427 if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
7428 return false;
7429 }
7430
7431 return true;
7432}
7433
7434// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
7435// from a pair of inputs. For example:
7436// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7437// FP_ROUND(EXTRACT_ELT(Y, 0),
7438// FP_ROUND(EXTRACT_ELT(X, 1),
7439// FP_ROUND(EXTRACT_ELT(Y, 1), ...)
// NOTE(review): the first line of the signature (embedded line 7440,
// declaring BV and DAG) is elided from this extract.
7441 const ARMSubtarget *ST) {
7442 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7443 if (!ST->hasMVEFloatOps())
7444 return SDValue();
7445
// Only an 8 x f16 result is handled here.
7446 SDLoc dl(BV);
7447 EVT VT = BV.getValueType();
7448 if (VT != MVT::v8f16)
7449 return SDValue();
7450
7451 // We are looking for a buildvector of fptrunc elements, where all the
7452 // elements are interleavingly extracted from two sources. Check the first two
7453 // items are valid enough and extract some info from them (they are checked
7454 // properly in the loop below).
7455 if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
7456 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7457 BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0)
7458 return SDValue();
7459 if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
7460 BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7461 BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0)
7462 return SDValue();
// Op0/Op1 are the two v4f32 source vectors being truncated.
7463 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7464 SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
7465 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
7466 return SDValue();
7467
7468 // Check all the values in the BuildVector line up with our expectations.
7469 for (unsigned i = 1; i < 4; i++) {
7470 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7471 return Trunc.getOpcode() == ISD::FP_ROUND &&
// NOTE(review): one conjunct of this condition (embedded line 7472) is
// elided from this extract.
7473 Trunc.getOperand(0).getOperand(0) == Op &&
7474 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7475 };
7476 if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
7477 return SDValue();
7478 if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
7479 return SDValue();
7480 }
7481
// Emit two chained VCVTNs: Op0 fills one set of lanes of an undef vector
// (selector 0), then Op1 fills the interleaved lanes (selector 1).
7482 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
7483 DAG.getConstant(0, dl, MVT::i32));
7484 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
7485 DAG.getConstant(1, dl, MVT::i32));
7486}
7487
7488// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
7489// from a single input on alternating lanes. For example:
7490// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7491// FP_ROUND(EXTRACT_ELT(X, 2),
7492// FP_ROUND(EXTRACT_ELT(X, 4), ...)
// NOTE(review): the first line of the signature (embedded line 7493,
// declaring BV and DAG) is elided from this extract.
7494 const ARMSubtarget *ST) {
7495 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7496 if (!ST->hasMVEFloatOps())
7497 return SDValue();
7498
// Only a 4 x f32 result is handled here.
7499 SDLoc dl(BV);
7500 EVT VT = BV.getValueType();
7501 if (VT != MVT::v4f32)
7502 return SDValue();
7503
7504 // We are looking for a buildvector of fptext elements, where all the
7505 // elements are alternating lanes from a single source. For example <0,2,4,6>
7506 // or <1,3,5,7>. Check the first two items are valid enough and extract some
7507 // info from them (they are checked properly in the loop below).
7508 if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
7509 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7510 return SDValue();
// Op0 is the v8f16 source; Offset selects the even (0) or odd (1) lanes.
7511 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7512 int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1);
7513 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
7514 return SDValue();
7515
7516 // Check all the values in the BuildVector line up with our expectations.
7517 for (unsigned i = 1; i < 4; i++) {
7518 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7519 return Trunc.getOpcode() == ISD::FP_EXTEND &&
// NOTE(review): one conjunct of this condition (embedded line 7520) is
// elided from this extract.
7521 Trunc.getOperand(0).getOperand(0) == Op &&
7522 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7523 };
7524 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
7525 return SDValue();
7526 }
7527
// A single VCVTL widens the selected alternating lanes of Op0.
7528 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
7529 DAG.getConstant(Offset, dl, MVT::i32));
7530}
7531
7532// If N is an integer constant that can be moved into a register in one
7533// instruction, return an SDValue of such a constant (will become a MOV
7534// instruction). Otherwise return null.
// NOTE(review): the first line of the signature (embedded line 7535,
// declaring N and DAG) is elided from this extract.
7536 const ARMSubtarget *ST, const SDLoc &dl) {
7537 uint64_t Val;
7538 if (!isa<ConstantSDNode>(N))
7539 return SDValue();
7540 Val = cast<ConstantSDNode>(N)->getZExtValue();
7541
7542 if (ST->isThumb1Only()) {
// Thumb1: accept if the value or its bitwise complement fits in 8 bits.
7543 if (Val <= 255 || ~Val <= 255)
7544 return DAG.getConstant(Val, dl, MVT::i32);
7545 } else {
// ARM/Thumb2: accept if the value or its complement is encodable as a
// so_imm (rotated 8-bit immediate); getSOImmVal returns -1 otherwise.
7546 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
7547 return DAG.getConstant(Val, dl, MVT::i32);
7548 }
7549 return SDValue();
7550}
7551
// NOTE(review): the first line of this function's signature (embedded line
// 7552, declaring Op and DAG for LowerBUILD_VECTOR_i1 -- named in the assert
// below and at its call site) is elided from this extract.
7553 const ARMSubtarget *ST) {
7554 SDLoc dl(Op);
7555 EVT VT = Op.getValueType();
7556
7557 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
7558
// Each boolean lane occupies BitsPerBool bits of a 16-bit predicate-style
// value; BoolMask is the all-ones pattern for one lane.
7559 unsigned NumElts = VT.getVectorNumElements();
7560 unsigned BoolMask;
7561 unsigned BitsPerBool;
7562 if (NumElts == 4) {
7563 BitsPerBool = 4;
7564 BoolMask = 0xf;
7565 } else if (NumElts == 8) {
7566 BitsPerBool = 2;
7567 BoolMask = 0x3;
7568 } else if (NumElts == 16) {
7569 BitsPerBool = 1;
7570 BoolMask = 0x1;
7571 } else
7572 return SDValue();
7573
7574 // If this is a single value copied into all lanes (a splat), we can just sign
7575 // extend that single value
7576 SDValue FirstOp = Op.getOperand(0);
7577 if (!isa<ConstantSDNode>(FirstOp) &&
7578 std::all_of(std::next(Op->op_begin()), Op->op_end(),
7579 [&FirstOp](SDUse &U) {
7580 return U.get().isUndef() || U.get() == FirstOp;
7581 })) {
7582 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
7583 DAG.getValueType(MVT::i1));
7584 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
7585 }
7586
7587 // First create base with bits set where known
7588 unsigned Bits32 = 0;
7589 for (unsigned i = 0; i < NumElts; ++i) {
7590 SDValue V = Op.getOperand(i);
7591 if (!isa<ConstantSDNode>(V) && !V.isUndef())
7592 continue;
// Undef lanes are treated as false; constant lanes set their whole field.
7593 bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue();
7594 if (BitSet)
7595 Bits32 |= BoolMask << (i * BitsPerBool);
7596 }
7597
7598 // Add in unknown nodes
// NOTE(review): the line creating Base (embedded line 7599) is elided from
// this extract; Bits32 is materialized there before the inserts below.
7600 DAG.getConstant(Bits32, dl, MVT::i32));
7601 for (unsigned i = 0; i < NumElts; ++i) {
7602 SDValue V = Op.getOperand(i);
7603 if (isa<ConstantSDNode>(V) || V.isUndef())
7604 continue;
7605 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
7606 DAG.getConstant(i, dl, MVT::i32));
7607 }
7608
7609 return Base;
7610}
7611
7613 const ARMSubtarget *ST) {
7614 if (!ST->hasMVEIntegerOps())
7615 return SDValue();
7616
7617 // We are looking for a buildvector where each element is Op[0] + i*N
7618 EVT VT = Op.getValueType();
7619 SDValue Op0 = Op.getOperand(0);
7620 unsigned NumElts = VT.getVectorNumElements();
7621
7622 // Get the increment value from operand 1
7623 SDValue Op1 = Op.getOperand(1);
7624 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
7626 return SDValue();
7627 unsigned N = Op1.getConstantOperandVal(1);
7628 if (N != 1 && N != 2 && N != 4 && N != 8)
7629 return SDValue();
7630
7631 // Check that each other operand matches
7632 for (unsigned I = 2; I < NumElts; I++) {
7633 SDValue OpI = Op.getOperand(I);
7634 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
7636 OpI.getConstantOperandVal(1) != I * N)
7637 return SDValue();
7638 }
7639
7640 SDLoc DL(Op);
7641 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
7642 DAG.getConstant(N, DL, MVT::i32));
7643}
7644
7645// If this is a case we can't handle, return null and let the default
7646// expansion code take care of it.
7647SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
7648 const ARMSubtarget *ST) const {
// NOTE(review): several lines are elided from this extract (e.g. embedded
// line 7649 declaring BVN, 7735, 7773, 7782, 7799, 7808-7810, 7853-7854,
// 7859, 7873 and 7875); variables that appear undeclared below (BVN, Ops,
// constIndex, FVT, HVT, Upper, EltVT, ...) are declared on those lines.
7650 SDLoc dl(Op);
7651 EVT VT = Op.getValueType();
7652
// 1-bit element vectors are MVE predicates and have their own lowering.
7653 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7654 return LowerBUILD_VECTOR_i1(Op, DAG, ST);
7655
7656 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
7657 return R;
7658
// Constant splats: try the immediate-form vector move instructions first.
7659 APInt SplatBits, SplatUndef;
7660 unsigned SplatBitSize;
7661 bool HasAnyUndefs;
7662 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
7663 if (SplatUndef.isAllOnesValue())
7664 return DAG.getUNDEF(VT);
7665
7666 if ((ST->hasNEON() && SplatBitSize <= 64) ||
7667 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
7668 // Check if an immediate VMOV works.
7669 EVT VmovVT;
7670 SDValue Val =
7671 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
7672 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
7673
7674 if (Val.getNode()) {
7675 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
7676 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
7677 }
7678
7679 // Try an immediate VMVN.
7680 uint64_t NegatedImm = (~SplatBits).getZExtValue();
7681 Val = isVMOVModifiedImm(
7682 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
7683 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
7684 if (Val.getNode()) {
7685 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
7686 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
7687 }
7688
7689 // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
7690 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
7691 int ImmVal = ARM_AM::getFP32Imm(SplatBits);
7692 if (ImmVal != -1) {
7693 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
7694 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
7695 }
7696 }
7697
7698 // If we are under MVE, generate a VDUP(constant), bitcast to the original
7699 // type.
7700 if (ST->hasMVEIntegerOps() &&
7701 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
7702 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7703 : SplatBitSize == 16 ? MVT::v8i16
7704 : MVT::v16i8;
7705 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7706 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7707 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7708 }
7709 }
7710 }
7711
7712 // Scan through the operands to see if only one value is used.
7713 //
7714 // As an optimisation, even if more than one value is used it may be more
7715 // profitable to splat with one value then change some lanes.
7716 //
7717 // Heuristically we decide to do this if the vector has a "dominant" value,
7718 // defined as splatted to more than half of the lanes.
7719 unsigned NumElts = VT.getVectorNumElements();
7720 bool isOnlyLowElement = true;
7721 bool usesOnlyOneValue = true;
7722 bool hasDominantValue = false;
7723 bool isConstant = true;
7724
7725 // Map of the number of times a particular SDValue appears in the
7726 // element list.
7727 DenseMap<SDValue, unsigned> ValueCounts;
7728 SDValue Value;
7729 for (unsigned i = 0; i < NumElts; ++i) {
7730 SDValue V = Op.getOperand(i);
7731 if (V.isUndef())
7732 continue;
7733 if (i > 0)
7734 isOnlyLowElement = false;
// NOTE(review): the condition guarding this assignment (embedded line 7735,
// a constant-node test on V) is elided from this extract.
7736 isConstant = false;
7737
7738 ValueCounts.insert(std::make_pair(V, 0));
7739 unsigned &Count = ValueCounts[V];
7740
7741 // Is this value dominant? (takes up more than half of the lanes)
7742 if (++Count > (NumElts / 2)) {
7743 hasDominantValue = true;
7744 Value = V;
7745 }
7746 }
7747 if (ValueCounts.size() != 1)
7748 usesOnlyOneValue = false;
7749 if (!Value.getNode() && !ValueCounts.empty())
7750 Value = ValueCounts.begin()->first;
7751
7752 if (ValueCounts.empty())
7753 return DAG.getUNDEF(VT);
7754
7755 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
7756 // Keep going if we are hitting this case.
7757 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
7758 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
7759
7760 unsigned EltSize = VT.getScalarSizeInBits();
7761
7762 // Use VDUP for non-constant splats. For f32 constant splats, reduce to
7763 // i32 and try again.
7764 if (hasDominantValue && EltSize <= 32) {
7765 if (!isConstant) {
7766 SDValue N;
7767
7768 // If we are VDUPing a value that comes directly from a vector, that will
7769 // cause an unnecessary move to and from a GPR, where instead we could
7770 // just use VDUPLANE. We can only do this if the lane being extracted
7771 // is at a constant index, as the VDUP from lane instructions only have
7772 // constant-index forms.
7774 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7775 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
7776 // We need to create a new undef vector to use for the VDUPLANE if the
7777 // size of the vector from which we get the value is different than the
7778 // size of the vector that we need to create. We will insert the element
7779 // such that the register coalescer will remove unnecessary copies.
7780 if (VT != Value->getOperand(0).getValueType()) {
7781 unsigned index = constIndex->getAPIntValue().getLimitedValue() %
7783 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7784 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
7785 Value, DAG.getConstant(index, dl, MVT::i32)),
7786 DAG.getConstant(index, dl, MVT::i32));
7787 } else
7788 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7789 Value->getOperand(0), Value->getOperand(1));
7790 } else
7791 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
7792
7793 if (!usesOnlyOneValue) {
7794 // The dominant value was splatted as 'N', but we now have to insert
7795 // all differing elements.
7796 for (unsigned I = 0; I < NumElts; ++I) {
7797 if (Op.getOperand(I) == Value)
7798 continue;
7800 Ops.push_back(N);
7801 Ops.push_back(Op.getOperand(I));
7802 Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
7803 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
7804 }
7805 }
7806 return N;
7807 }
// NOTE(review): the start of this else-branch (embedded lines 7808-7810,
// handling constant floating-point splats by bitcasting lanes to integers
// and re-lowering) is elided from this extract.
7811 assert(FVT == MVT::f32 || FVT == MVT::f16);
7812 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
7813 for (unsigned i = 0; i < NumElts; ++i)
7814 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
7815 Op.getOperand(i)));
7816 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
7817 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
7818 Val = LowerBUILD_VECTOR(Val, DAG, ST);
7819 if (Val.getNode())
7820 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7821 }
7822 if (usesOnlyOneValue) {
7823 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
7824 if (isConstant && Val.getNode())
7825 return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
7826 }
7827 }
7828
7829 // If all elements are constants and the case above didn't get hit, fall back
7830 // to the default expansion, which will generate a load from the constant
7831 // pool.
7832 if (isConstant)
7833 return SDValue();
7834
7835 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
7836 // vmovn). Empirical tests suggest this is rarely worth it for vectors of
7837 // length <= 2.
7838 if (NumElts >= 4)
7839 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
7840 return shuffle;
7841
7842 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
7843 // VCVT's
7844 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
7845 return VCVT;
7846 if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
7847 return VCVT;
7848
7849 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
7850 // If we haven't found an efficient lowering, try splitting a 128-bit vector
7851 // into two 64-bit vectors; we might discover a better way to lower it.
7852 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
// NOTE(review): embedded lines 7853-7854 (declaring HVT, the half-width
// vector type) and 7859 (starting the Upper build) are elided here.
7855 SDValue Lower =
7856 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2));
7857 if (Lower.getOpcode() == ISD::BUILD_VECTOR)
7858 Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
7860 HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2));
7861 if (Upper.getOpcode() == ISD::BUILD_VECTOR)
7862 Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
7863 if (Lower && Upper)
7864 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
7865 }
7866
7867 // Vectors with 32- or 64-bit elements can be built by directly assigning
7868 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
7869 // will be legalized.
7870 if (EltSize >= 32) {
7871 // Do the expansion with floating-point types, since that is what the VFP
7872 // registers are defined to use, and since i64 is not legal.
7874 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
7876 for (unsigned i = 0; i < NumElts; ++i)
7877 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
7878 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
7879 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7880 }
7881
7882 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
7883 // know the default expansion would otherwise fall back on something even
7884 // worse. For a vector with one or two non-undef values, that's
7885 // scalar_to_vector for the elements followed by a shuffle (provided the
7886 // shuffle is valid for the target) and materialization element by element
7887 // on the stack followed by a load for everything else.
7888 if (!isConstant && !usesOnlyOneValue) {
7889 SDValue Vec = DAG.getUNDEF(VT);
7890 for (unsigned i = 0 ; i < NumElts; ++i) {
7891 SDValue V = Op.getOperand(i);
7892 if (V.isUndef())
7893 continue;
7894 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
7895 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
7896 }
7897 return Vec;
7898 }
7899
7900 return SDValue();
7901}
7902
7903// Gather data to see if the operation can be modelled as a
7904// shuffle in combination with VEXTs.
7905SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
7906 SelectionDAG &DAG) const {
// NOTE(review): several lines are elided from this extract (e.g. embedded
// lines 7920 declaring ShuffleVec, 7934 declaring Sources, 7968 declaring
// SmallestEltTy, 7977 declaring ShuffleVT, 7994, 8028, 8031, 8058, 8077,
// 8094 and 8098); variables appearing undeclared below come from them.
7907 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7908 SDLoc dl(Op);
7909 EVT VT = Op.getValueType();
7910 unsigned NumElts = VT.getVectorNumElements();
7911
// Per-source bookkeeping: the original vector, the lane range used from it,
// and the (possibly rewritten) vector actually fed to the final shuffle.
7912 struct ShuffleSourceInfo {
7913 SDValue Vec;
7914 unsigned MinElt = std::numeric_limits<unsigned>::max();
7915 unsigned MaxElt = 0;
7916
7917 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
7918 // be compatible with the shuffle we intend to construct. As a result
7919 // ShuffleVec will be some sliding window into the original Vec.
7921
7922 // Code should guarantee that element i in Vec starts at element "WindowBase
7923 // + i * WindowScale in ShuffleVec".
7924 int WindowBase = 0;
7925 int WindowScale = 1;
7926
7927 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
7928
7929 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
7930 };
7931
7932 // First gather all vectors used as an immediate source for this BUILD_VECTOR
7933 // node.
7935 for (unsigned i = 0; i < NumElts; ++i) {
7936 SDValue V = Op.getOperand(i);
7937 if (V.isUndef())
7938 continue;
7939 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
7940 // A shuffle can only come from building a vector from various
7941 // elements of other vectors.
7942 return SDValue();
7943 } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
7944 // Furthermore, shuffles require a constant mask, whereas extractelts
7945 // accept variable indices.
7946 return SDValue();
7947 }
7948
7949 // Add this element source to the list if it's not already there.
7950 SDValue SourceVec = V.getOperand(0);
7951 auto Source = llvm::find(Sources, SourceVec);
7952 if (Source == Sources.end())
7953 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
7954
7955 // Update the minimum and maximum lane number seen.
7956 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
7957 Source->MinElt = std::min(Source->MinElt, EltNo);
7958 Source->MaxElt = std::max(Source->MaxElt, EltNo);
7959 }
7960
7961 // Currently only do something sane when at most two source vectors
7962 // are involved.
7963 if (Sources.size() > 2)
7964 return SDValue();
7965
7966 // Find out the smallest element size among result and two sources, and use
7967 // it as element size to build the shuffle_vector.
7969 for (auto &Source : Sources) {
7970 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
7971 if (SrcEltTy.bitsLT(SmallestEltTy))
7973 }
7974 unsigned ResMultiplier =
7975 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
7976 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
7978
7979 // If the source vector is too wide or too narrow, we may nevertheless be able
7980 // to construct a compatible shuffle either by concatenating it with UNDEF or
7981 // extracting a suitable range of elements.
7982 for (auto &Src : Sources) {
7983 EVT SrcVT = Src.ShuffleVec.getValueType();
7984
7985 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
7986 uint64_t VTSize = VT.getFixedSizeInBits();
7987 if (SrcVTSize == VTSize)
7988 continue;
7989
7990 // This stage of the search produces a source with the same element type as
7991 // the original, but with a total width matching the BUILD_VECTOR output.
7992 EVT EltVT = SrcVT.getVectorElementType();
7993 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
7995
7996 if (SrcVTSize < VTSize) {
7997 if (2 * SrcVTSize != VTSize)
7998 return SDValue();
7999 // We can pad out the smaller vector for free, so if it's part of a
8000 // shuffle...
8001 Src.ShuffleVec =
8002 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
8003 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
8004 continue;
8005 }
8006
8007 if (SrcVTSize != 2 * VTSize)
8008 return SDValue();
8009
8010 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
8011 // Span too large for a VEXT to cope
8012 return SDValue();
8013 }
8014
8015 if (Src.MinElt >= NumSrcElts) {
8016 // The extraction can just take the second half
8017 Src.ShuffleVec =
8018 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8019 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8020 Src.WindowBase = -NumSrcElts;
8021 } else if (Src.MaxElt < NumSrcElts) {
8022 // The extraction can just take the first half
8023 Src.ShuffleVec =
8024 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8025 DAG.getConstant(0, dl, MVT::i32));
8026 } else {
8027 // An actual VEXT is needed
8029 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8030 DAG.getConstant(0, dl, MVT::i32));
8032 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8033 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8034
8035 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
8036 VEXTSrc2,
8037 DAG.getConstant(Src.MinElt, dl, MVT::i32));
8038 Src.WindowBase = -Src.MinElt;
8039 }
8040 }
8041
8042 // Another possible incompatibility occurs from the vector element types. We
8043 // can fix this by bitcasting the source vectors to the same type we intend
8044 // for the shuffle.
8045 for (auto &Src : Sources) {
8046 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8047 if (SrcEltTy == SmallestEltTy)
8048 continue;
8049 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8050 Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
8051 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
8052 Src.WindowBase *= Src.WindowScale;
8053 }
8054
8055 // Final sanity check before we try to actually produce a shuffle.
8056 LLVM_DEBUG(for (auto Src
8057 : Sources)
8059
8060 // The stars all align, our next step is to produce the mask for the shuffle.
8061 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8062 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8063 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8064 SDValue Entry = Op.getOperand(i);
8065 if (Entry.isUndef())
8066 continue;
8067
8068 auto Src = llvm::find(Sources, Entry.getOperand(0));
8069 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
8070
8071 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8072 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
8073 // segment.
8074 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8075 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8076 VT.getScalarSizeInBits());
8078
8079 // This source is expected to fill ResMultiplier lanes of the final shuffle,
8080 // starting at the appropriate offset.
8081 int *LaneMask = &Mask[i * ResMultiplier];
8082
8083 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
8084 ExtractBase += NumElts * (Src - Sources.begin());
8085 for (int j = 0; j < LanesDefined; ++j)
8086 LaneMask[j] = ExtractBase + j;
8087 }
8088
8089
8090 // We can't handle more than two sources. This should have already
8091 // been checked before this point.
8092 assert(Sources.size() <= 2 && "Too many sources!");
8093
8095 for (unsigned i = 0; i < Sources.size(); ++i)
8096 ShuffleOps[i] = Sources[i].ShuffleVec;
8097
8099 ShuffleOps[1], Mask, DAG);
8100 if (!Shuffle)
8101 return SDValue();
8102 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
8103}
8104
8106 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
8115 OP_VUZPL, // VUZP, left result
8116 OP_VUZPR, // VUZP, right result
8117 OP_VZIPL, // VZIP, left result
8118 OP_VZIPR, // VZIP, right result
8119 OP_VTRNL, // VTRN, left result
8120 OP_VTRNR // VTRN, right result
8122
8123static bool isLegalMVEShuffleOp(unsigned PFEntry) {
8124 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8125 switch (OpNum) {
8126 case OP_COPY:
8127 case OP_VREV:
8128 case OP_VDUP0:
8129 case OP_VDUP1:
8130 case OP_VDUP2:
8131 case OP_VDUP3:
8132 return true;
8133 }
8134 return false;
8135}
8136
/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
  // Any mask over a 4-element vector can be checked against the
  // perfect-shuffle table, which knows how to synthesize it from simpler
  // operations; accept it if the table's cost estimate is low enough.
  if (VT.getVectorNumElements() == 4 &&
      (VT.is128BitVector() || VT.is64BitVector())) {
    unsigned PFIndexes[4];
    for (unsigned i = 0; i != 4; ++i) {
      // Undef mask elements are encoded as index 8 in the table.
      if (M[i] < 0)
        PFIndexes[i] = 8;
      else
        PFIndexes[i] = M[i];
    }

    // Compute the index in the perfect shuffle table.
    unsigned PFTableIndex =
      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
    // The top two bits of a table entry hold the synthesis cost.
    unsigned Cost = (PFEntry >> 30);

    if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
      return true;
  }

  bool ReverseVEXT, isV_UNDEF;
  unsigned Imm, WhichResult;

  unsigned EltSize = VT.getScalarSizeInBits();
  // Otherwise, only masks with direct instruction support are legal:
  // VREV on any subtarget; VEXT/VTBL/byte-halfword reversal on NEON;
  // VMOVN-style narrowing masks on MVE.
  if (EltSize >= 32 ||
      isVREVMask(M, VT, 64) ||
      isVREVMask(M, VT, 32) ||
      isVREVMask(M, VT, 16))
    return true;
  else if (Subtarget->hasNEON() &&
           (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
            isVTBLMask(M, VT) ||
    return true;
  else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) &&
           isReverseMask(M, VT))
    return true;
  else if (Subtarget->hasMVEIntegerOps() &&
           (isVMOVNMask(M, VT, true, false) ||
            isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
    return true;
  else
    return false;
}
8188
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
/// The entry packs: cost in bits [31:30], opcode in bits [29:26], and two
/// 13-bit operand IDs (base-9 encodings of 4-element masks) in the low bits.
                                      SDValue RHS, SelectionDAG &DAG,
                                      const SDLoc &dl) {
  unsigned OpNum = (PFEntry >> 26) & 0x0F;
  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
  unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);

  if (OpNum == OP_COPY) {
    // OP_COPY selects one whole input: ID (0*9+1)*9+2)*9+3, i.e. mask
    // <0,1,2,3>, is the LHS; mask <4,5,6,7> is the RHS.
    if (LHSID == (1*9+2)*9+3) return LHS;
    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
    return RHS;
  }

  EVT VT = OpLHS.getValueType();

  switch (OpNum) {
  default: llvm_unreachable("Unknown shuffle opcode!");
  case OP_VREV:
    // VREV divides the vector in half and swaps within the half.
    if (VT.getVectorElementType() == MVT::i32 ||
      return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
    // vrev <4 x i16> -> VREV32
    if (VT.getVectorElementType() == MVT::i16 ||
      return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
    // vrev <4 x i8> -> VREV16
    return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
  case OP_VDUP0:
  case OP_VDUP1:
  case OP_VDUP2:
  case OP_VDUP3:
    // Duplicate lane (OpNum - OP_VDUP0) across the whole vector.
    return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                       OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
  case OP_VEXT1:
  case OP_VEXT2:
  case OP_VEXT3:
    // Extract a vector from the concatenation of the two inputs, starting at
    // byte offset (OpNum - OP_VEXT1 + 1) elements in.
    return DAG.getNode(ARMISD::VEXT, dl, VT,
                       OpLHS, OpRHS,
                       DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
  case OP_VUZPL:
  case OP_VUZPR:
    // Two-result operations: pick the left/right result by value index.
    return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
                       OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
  case OP_VZIPL:
  case OP_VZIPR:
    return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
                       OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
  case OP_VTRNL:
  case OP_VTRNR:
    return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
                       OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
  }
}
8249
                                       ArrayRef<int> ShuffleMask,
                                       SelectionDAG &DAG) {
  // Check to see if we can use the VTBL instruction.
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc DL(Op);

  // Materialize the shuffle mask as a build-vector of i32 constants that
  // serves as the VTBL index vector.
       I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
    VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32));

  // A one-register table lookup (VTBL1) suffices when the second input is
  // undef; otherwise use the two-register form (VTBL2).
  if (V2.getNode()->isUndef())
    return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,

  return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
}
8270
                                                      SelectionDAG &DAG) {
  // Lower a lane-reversing shuffle of a v8i16/v16i8: VREV64 reverses the
  // lanes within each doubleword, then a VEXT swaps the two doublewords.
  SDLoc DL(Op);
  SDValue OpLHS = Op.getOperand(0);
  EVT VT = OpLHS.getValueType();

  assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
         "Expect an v8i16/v16i8 type");
  OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
  // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now,
  // extract the first 8 bytes into the top double word and the last 8 bytes
  // into the bottom double word. The v8i16 case is similar.
  unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
  return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
}
8287
  // Map an MVE predicate vector type (v4i1/v8i1/v16i1) to the 128-bit
  // integer vector type with the same number of lanes.
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::v4i1:
    return MVT::v4i32;
  case MVT::v8i1:
    return MVT::v8i16;
  case MVT::v16i1:
    return MVT::v16i8;
  default:
    llvm_unreachable("Unexpected vector predicate type");
  }
}
8300
                                    SelectionDAG &DAG) {
  // Converting from boolean predicates to integers involves creating a vector
  // of all ones or all zeroes and selecting the lanes based upon the real
  // predicate.
  SDValue AllOnes =
  AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);

  // Get full vector type from predicate type

  // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast
  // this to a v16i1. This cannot be done with an ordinary bitcast because the
  // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,
  // since we know in hardware the sizes are really the same.
  if (VT != MVT::v16i1)
  else
    RecastV1 = Pred;

  // Select either all ones or zeroes depending upon the real predicate bits.
      DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);

  // Recast our new predicate-as-integer v16i8 vector into something
  // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
  // (The select is done in v16i8 — one byte per predicate bit — and the
  // final bitcast regroups those bytes into the wider lanes.)
  return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
}
8335
                                      const ARMSubtarget *ST) {
  // Lower a shuffle of i1 predicate vectors under MVE.
  EVT VT = Op.getValueType();
  ArrayRef<int> ShuffleMask = SVN->getMask();

  assert(ST->hasMVEIntegerOps() &&
         "No support for vector shuffle of boolean predicates");

  SDValue V1 = Op.getOperand(0);
  SDLoc dl(Op);
  if (isReverseMask(ShuffleMask, VT)) {
    // A full reversal can be done directly on the predicate bits in a GPR:
    // the reversed bits are shifted down by 16 and cast back to a predicate.
                            DAG.getConstant(16, dl, MVT::i32));
    return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
  }

  // Until we can come up with optimised cases for every single vector
  // shuffle in existence we have chosen the least painful strategy. This is
  // to essentially promote the boolean predicate to a 8-bit integer, where
  // each predicate represents a byte. Then we fall back on a normal integer
  // vector shuffle and convert the result back into a predicate vector. In
  // many cases the generated code might be even better than scalar code
  // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
  // fields in a register into 8 other arbitrary 2-bit fields!
  EVT NewVT = PredAsVector.getValueType();

  // Do the shuffle!
  DAG.getUNDEF(NewVT), ShuffleMask);

  // Now return the result of comparing the shuffled vector with zero,
  // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
  return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
                     DAG.getConstant(ARMCC::NE, dl, MVT::i32));
}
8375
                                            ArrayRef<int> ShuffleMask,
                                            SelectionDAG &DAG) {
  // Attempt to lower the vector shuffle using as many whole register movs as
  // possible. This is useful for types smaller than 32bits, which would
  // often otherwise become a series of GPR movs.
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  if (VT.getScalarSizeInBits() >= 32)
    return SDValue();

  assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
         "Unexpected vector type");
  int NumElts = VT.getVectorNumElements();
  int QuarterSize = NumElts / 4;
  // The four final parts of the vector, as i32's
  SDValue Parts[4];

  // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
  // <u,u,u,u>), returning the vmov lane index
  auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
    // Detect which mov lane this would be from the first non-undef element.
    int MovIdx = -1;
    for (int i = 0; i < Length; i++) {
      if (ShuffleMask[Start + i] >= 0) {
        // The element must sit at its natural position within the quarter.
        if (ShuffleMask[Start + i] % Length != i)
          return -1;
        MovIdx = ShuffleMask[Start + i] / Length;
        break;
      }
    }
    // If all items are undef, leave this for other combines
    if (MovIdx == -1)
      return -1;
    // Check the remaining values are the correct part of the same mov
    for (int i = 1; i < Length; i++) {
      if (ShuffleMask[Start + i] >= 0 &&
          (ShuffleMask[Start + i] / Length != MovIdx ||
           ShuffleMask[Start + i] % Length != i))
        return -1;
    }
    return MovIdx;
  };

  for (int Part = 0; Part < 4; ++Part) {
    // Does this part look like a mov
    int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
    if (Elt != -1) {
      // Indices >= 4 refer to quarters of the second shuffle operand.
      SDValue Input = Op->getOperand(0);
      if (Elt >= 4) {
        Input = Op->getOperand(1);
        Elt -= 4;
      }
      // Extract the whole 32-bit quarter as an f32 lane of a v4f32 view.
      SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
      Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
                                DAG.getConstant(Elt, dl, MVT::i32));
    }
  }

  // Nothing interesting found, just return
  if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
    return SDValue();

  // The other parts need to be built with the old shuffle vector, cast to a
  // v4i32 and extract_vector_elts
  if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
    // Mask off the quarters already handled above so the residual shuffle
    // only computes the remaining parts.
    for (int Part = 0; Part < 4; ++Part)
      for (int i = 0; i < QuarterSize; i++)
        NewShuffleMask.push_back(
            Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
        VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);

    for (int Part = 0; Part < 4; ++Part)
      if (!Parts[Part])
        Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
                                  BitCast, DAG.getConstant(Part, dl, MVT::i32));
  }
  // Build a vector out of the various parts and bitcast it back to the original
  // type.
  return DAG.getBitcast(VT, NewVec);
}
8461
                                              ArrayRef<int> ShuffleMask,
                                              SelectionDAG &DAG) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  EVT VT = Op.getValueType();
  unsigned NumElts = VT.getVectorNumElements();

  // A One-Off Identity mask is one that is mostly an identity mask from a
  // single source but contains a single element out-of-place, either from a
  // different vector or from another position in the same vector. As opposed to
  // lowering this via a ARMISD::BUILD_VECTOR we can generate an extract/insert
  // pair directly.
  auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
                                 int &OffElement) {
    OffElement = -1;
    int NonUndef = 0;
    for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
      if (Mask[i] == -1)
        continue;
      NonUndef++;
      // More than one out-of-place element disqualifies the mask.
      if (Mask[i] != i + BaseOffset) {
        if (OffElement == -1)
          OffElement = i;
        else
          return false;
      }
    }
    // Require at least three defined elements and exactly one misplaced one.
    return NonUndef > 2 && OffElement != -1;
  };
  int OffElement;
  if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
    VInput = V1;
  else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
    VInput = V2;
  else
    return SDValue();

  SDLoc dl(Op);
  // i8/i16 lanes are extracted/inserted through i32, matching how narrow
  // lane moves work on the target.
  EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
                ? MVT::i32
                : VT.getScalarType();
  SDValue Elt = DAG.getNode(
      ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
      DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
  return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
}
8512
                                   const ARMSubtarget *ST) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  unsigned EltSize = VT.getScalarSizeInBits();

  // Predicate (i1) vector shuffles get their own MVE-specific lowering.
  if (ST->hasMVEIntegerOps() && EltSize == 1)
    return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);

  // Convert shuffles that are directly supported on NEON to target-specific
  // DAG nodes, instead of keeping them as shuffles and matching them again
  // during code selection. This is more efficient and avoids the possibility
  // of inconsistencies between legalization and selection.
  // FIXME: floating-point vectors should be canonicalized to integer vectors
  // of the same type so that they get CSEd properly.
  ArrayRef<int> ShuffleMask = SVN->getMask();

  if (EltSize <= 32) {
    if (SVN->isSplat()) {
      int Lane = SVN->getSplatIndex();
      // If this is undef splat, generate it via "just" vdup, if possible.
      if (Lane == -1) Lane = 0;

      // Test if V1 is a SCALAR_TO_VECTOR.
      if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
        return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
      }
      // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
      // (and probably will turn into a SCALAR_TO_VECTOR once legalization
      // reaches it).
      if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
          !isa<ConstantSDNode>(V1.getOperand(0))) {
        bool IsScalarToVector = true;
        for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
          if (!V1.getOperand(i).isUndef()) {
            IsScalarToVector = false;
            break;
          }
        if (IsScalarToVector)
          return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
      }
      // Generic splat: duplicate the chosen lane across the vector.
      return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
                         DAG.getConstant(Lane, dl, MVT::i32));
    }

    bool ReverseVEXT = false;
    unsigned Imm = 0;
    if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
      if (ReverseVEXT)
        std::swap(V1, V2);
      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
                         DAG.getConstant(Imm, dl, MVT::i32));
    }

    if (isVREVMask(ShuffleMask, VT, 64))
      return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
    if (isVREVMask(ShuffleMask, VT, 32))
      return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
    if (isVREVMask(ShuffleMask, VT, 16))
      return DAG.getNode(ARMISD::VREV16, dl, VT, V1);

    // Single-source VEXT: rotate within one register.
    if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
                         DAG.getConstant(Imm, dl, MVT::i32));
    }

    // Check for Neon shuffles that modify both input vectors in place.
    // If both results are used, i.e., if there are two shuffles with the same
    // source operands and with masks corresponding to both results of one of
    // these operations, DAG memoization will ensure that a single node is
    // used for both shuffles.
    unsigned WhichResult = 0;
    bool isV_UNDEF = false;
    if (ST->hasNEON()) {
              ShuffleMask, VT, WhichResult, isV_UNDEF)) {
        if (isV_UNDEF)
          V2 = V1;
        return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
      }
    }
    if (ST->hasMVEIntegerOps()) {
      // VMOVN narrowing-move patterns: the flags select which operand is
      // inserted and whether the top or bottom lanes are written.
      if (isVMOVNMask(ShuffleMask, VT, false, false))
        return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
                           DAG.getConstant(0, dl, MVT::i32));
      if (isVMOVNMask(ShuffleMask, VT, true, false))
        return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
                           DAG.getConstant(1, dl, MVT::i32));
      if (isVMOVNMask(ShuffleMask, VT, true, true))
        return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
                           DAG.getConstant(1, dl, MVT::i32));
    }

    // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
    // shuffles that produce a result larger than their operands with:
    //   shuffle(concat(v1, undef), concat(v2, undef))
    // ->
    //   shuffle(concat(v1, v2), undef)
    // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
    //
    // This is useful in the general case, but there are special cases where
    // native shuffles produce larger results: the two-result ops.
    //
    // Look through the concat when lowering them:
    //   shuffle(concat(v1, v2), undef)
    // ->
    //   concat(VZIP(v1, v2):0, :1)
    //
    if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
      SDValue SubV1 = V1->getOperand(0);
      SDValue SubV2 = V1->getOperand(1);
      EVT SubVT = SubV1.getValueType();

      // We expect these to have been canonicalized to -1.
      assert(llvm::all_of(ShuffleMask, [&](int i) {
        return i < (int)VT.getVectorNumElements();
      }) && "Unexpected shuffle index into UNDEF operand!");

              ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
        if (isV_UNDEF)
          SubV2 = SubV1;
        assert((WhichResult == 0) &&
               "In-place shuffle of concat can only have one result!");
        SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
                                  SubV1, SubV2);
        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
                           Res.getValue(1));
      }
    }
  }

  // Single out-of-place element: lower as extract + insert.
  if (ST->hasMVEIntegerOps() && EltSize <= 32)
    if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
      return V;

  // If the shuffle is not directly supported and it has 4 elements, use
  // the PerfectShuffle-generated table to synthesize it from other shuffles.
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts == 4) {
    unsigned PFIndexes[4];
    for (unsigned i = 0; i != 4; ++i) {
      if (ShuffleMask[i] < 0)
        PFIndexes[i] = 8;
      else
        PFIndexes[i] = ShuffleMask[i];
    }

    // Compute the index in the perfect shuffle table.
    unsigned PFTableIndex =
      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
    unsigned Cost = (PFEntry >> 30);

    if (Cost <= 4) {
      if (ST->hasNEON())
        return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
      else if (isLegalMVEShuffleOp(PFEntry)) {
        // On MVE both the entry and its two operand sub-shuffles must be
        // representable with the MVE-legal subset of table ops.
        unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
        unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
          return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
      }
    }
  }

  // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
  if (EltSize >= 32) {
    // Do the expansion with floating-point types, since that is what the VFP
    // registers are defined to use, and since i64 is not legal.
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
    V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
    V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
    for (unsigned i = 0; i < NumElts; ++i) {
      if (ShuffleMask[i] < 0)
        Ops.push_back(DAG.getUNDEF(EltVT));
      else
        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
                                  ShuffleMask[i] < (int)NumElts ? V1 : V2,
                                  DAG.getConstant(ShuffleMask[i] & (NumElts-1),
                                                  dl, MVT::i32)));
    }
    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  }

  if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))

  if (ST->hasNEON() && VT == MVT::v8i8)
    if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
      return NewOp;

  // Last resort for MVE: whole-register 32-bit moves of mask quarters.
  if (ST->hasMVEIntegerOps())
    if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
      return NewOp;

  return SDValue();
}
8720
                                         const ARMSubtarget *ST) {
  // Insert one i1 element into an MVE predicate vector by editing the
  // predicate's bit pattern in a GPR.
  EVT VecVT = Op.getOperand(0).getValueType();
  SDLoc dl(Op);

  assert(ST->hasMVEIntegerOps() &&
         "LowerINSERT_VECTOR_ELT_i1 called without MVE!");

  // Move the whole predicate into an i32 so the lane bits can be manipulated
  // with ordinary integer operations.
  SDValue Conv =
      DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
  unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  // Each predicate lane occupies as many bits as the equivalent integer
  // vector has bytes per lane (e.g. 4 bits per lane for v4i1).
  unsigned LaneWidth =
      getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
  unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
                            Op.getOperand(1), DAG.getValueType(MVT::i1));
  // BFI merges the extended element bits into the lane's bit-group; ~Mask
  // presumably selects which bits of Conv are kept (see ARMISD::BFI).
  SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
                            DAG.getConstant(~Mask, dl, MVT::i32));
  return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
}
8741
SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // INSERT_VECTOR_ELT is legal only for immediate indexes.
  SDValue Lane = Op.getOperand(2);
  if (!isa<ConstantSDNode>(Lane))
    return SDValue();

  SDValue Elt = Op.getOperand(1);
  EVT EltVT = Elt.getValueType();

  // i1 predicate vectors need the MVE bit-level lowering.
  if (Subtarget->hasMVEIntegerOps() &&
      Op.getValueType().getScalarSizeInBits() == 1)
    return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);

  if (getTypeAction(*DAG.getContext(), EltVT) ==
    // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
    // but the type system will try to do that if we don't intervene.
    // Reinterpret any such vector-element insertion as one with the
    // corresponding integer types.

    SDLoc dl(Op);

    EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());

    SDValue VecIn = Op.getOperand(0);
    EVT VecVT = VecIn.getValueType();
                                  VecVT.getVectorNumElements());

                          IVecIn, IElt, Lane);
    // Cast the integer insert result back to the original vector type.
    return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
  }

  // All other cases are already legal.
  return Op;
}
8783
8785 const ARMSubtarget *ST) {
8786 EVT VecVT = Op.getOperand(0).getValueType();
8787 SDLoc dl(Op);
8788
8789 assert(ST->hasMVEIntegerOps() &&
8790 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8791
8792 SDValue Conv =
8793 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8794 unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
8795 unsigned LaneWidth =
8796 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
8797 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
8798 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
8799 return Shift;
8800}
8801
                                      const ARMSubtarget *ST) {
  // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
  SDValue Lane = Op.getOperand(1);
  if (!isa<ConstantSDNode>(Lane))
    return SDValue();

  SDValue Vec = Op.getOperand(0);
  EVT VT = Vec.getValueType();

  // i1 predicate lanes are extracted via bit manipulation of the predicate.
  if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
    return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);

  if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
    SDLoc dl(Op);
    // Sub-32-bit lanes extracted to i32 go through the lane-move node
    // (VGETLANEu — presumably the unsigned/any-extend form; see the ARMISD
    // node definition).
    return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
  }

  // Everything else is already legal.
  return Op;
}
8822
                                      const ARMSubtarget *ST) {
  // Concatenate two MVE predicate vectors by promoting both to integer
  // vectors, re-inserting the lanes into one wider vector, and comparing
  // against zero to form the result predicate.
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT Op1VT = V1.getValueType();
  EVT Op2VT = V2.getValueType();
  unsigned NumElts = VT.getVectorNumElements();

  assert(Op1VT == Op2VT && "Operand types don't match!");
  assert(VT.getScalarSizeInBits() == 1 &&
         "Unexpected custom CONCAT_VECTORS lowering");
  assert(ST->hasMVEIntegerOps() &&
         "CONCAT_VECTORS lowering only supported for MVE");

  SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);

  // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
  // promoted to v8i16, etc.

  MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();

  // Extract the vector elements from Op1 and Op2 one by one and truncate them
  // to be the right size for the destination. For example, if Op1 is v4i1 then
  // the promoted vector is v4i32. The result of concatenation gives a v8i1,
  // which when promoted is v8i16. That means each i32 element from Op1 needs
  // truncating to i16 and inserting in the result.
  auto ExractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
    EVT NewVT = NewV.getValueType();
    EVT ConcatVT = ConVec.getValueType();
    // j advances across both calls so the second operand's lanes land after
    // the first operand's.
    for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
                        DAG.getIntPtrConstant(i, dl));
                         DAG.getConstant(j, dl, MVT::i32));
    }
    return ConVec;
  };
  unsigned j = 0;

  // Now return the result of comparing the subvector with zero,
  // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
  return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
                     DAG.getConstant(ARMCC::NE, dl, MVT::i32));
}
8874
                                   const ARMSubtarget *ST) {
  EVT VT = Op->getValueType(0);
  // MVE predicate concatenation needs bit-level handling.
  if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
    return LowerCONCAT_VECTORS_i1(Op, DAG, ST);

  // The only time a CONCAT_VECTORS operation can have legal types is when
  // two 64-bit vectors are concatenated to a 128-bit vector.
  assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
         "unexpected CONCAT_VECTORS");
  SDLoc dl(Op);
  // Perform the concatenation as two f64 inserts into a v2f64, bitcasting
  // each 64-bit operand to f64; undef halves are simply left unset.
  SDValue Val = DAG.getUNDEF(MVT::v2f64);
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  if (!Op0.isUndef())
    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
                      DAG.getIntPtrConstant(0, dl));
  if (!Op1.isUndef())
    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
                      DAG.getIntPtrConstant(1, dl));
  return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
}
8899
                                      const ARMSubtarget *ST) {
  // Extract a sub-predicate from an MVE predicate vector: promote to an
  // integer vector, re-insert the selected lanes, and compare with zero.
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT Op1VT = V1.getValueType();
  unsigned NumElts = VT.getVectorNumElements();
  // The start index of the extraction is a constant operand.
  unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue();

  assert(VT.getScalarSizeInBits() == 1 &&
         "Unexpected custom EXTRACT_SUBVECTOR lowering");
  assert(ST->hasMVEIntegerOps() &&
         "EXTRACT_SUBVECTOR lowering only supported for MVE");

  // We now have Op1 promoted to a vector of integers, where v8i1 gets
  // promoted to v8i16, etc.

  MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();

  // Copy lanes [Index, Index + NumElts) of the promoted vector into the
  // sub-vector being built.
  for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
                      DAG.getIntPtrConstant(i, dl));
                       DAG.getConstant(j, dl, MVT::i32));
  }

  // Now return the result of comparing the subvector with zero,
  // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
  return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
                     DAG.getConstant(ARMCC::NE, dl, MVT::i32));
}
8936
// Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0).
                               const ARMSubtarget *ST) {
  assert(ST->hasMVEIntegerOps() && "Expected MVE!");
  EVT VT = N->getValueType(0);
  assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
         "Expected a vector i1 type!");
  SDValue Op = N->getOperand(0);
  EVT FromVT = Op.getValueType();
  SDLoc DL(N);

  // Only the low bit of each lane survives a truncate-to-i1, so mask it off
  // and test for non-zero to produce the predicate.
  SDValue And =
      DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
  return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
                     DAG.getCondCode(ISD::SETNE));
}
8953
                             const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasMVEIntegerOps())
    return SDValue();

  EVT ToVT = N->getValueType(0);
  // Truncation to a predicate vector has a dedicated lowering.
  if (ToVT.getScalarType() == MVT::i1)
    return LowerTruncatei1(N, DAG, Subtarget);

  // MVE does not have a single instruction to perform the truncation of a v4i32
  // into the lower half of a v8i16, in the same way that a NEON vmovn would.
  // Most of the instructions in MVE follow the 'Beats' system, where moving
  // values from different lanes is usually something that the instructions
  // avoid.
  //
  // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
  // which take the top/bottom half of a larger lane and extend it (or do the
  // opposite, truncating into the top/bottom lane from a larger lane). Note
  // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
  // bottom 16bits from each vector lane. This works really well with T/B
  // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
  // to move order.
  //
  // But truncates and sext/zext are always going to be fairly common from llvm.
  // We have several options for how to deal with them:
  // - Wherever possible combine them into an instruction that makes them
  //   "free". This includes loads/stores, which can perform the trunc as part
  //   of the memory operation. Or certain shuffles that can be turned into
  //   VMOVN/VMOVL.
  // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
  //   trunc(mul(sext(a), sext(b))) may become
  //   VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
  //   this case can use VMULL). This is performed in the
  //   MVELaneInterleavingPass.
  // - Otherwise we have an option. By default we would expand the
  //   zext/sext/trunc into a series of lane extract/inserts going via GPR
  //   registers. One for each vector lane in the vector. This can obviously be
  //   very expensive.
  // - The other option is to use the fact that loads/store can extend/truncate
  //   to turn a trunc into two truncating stack stores and a stack reload. This
  //   becomes 3 back-to-back memory operations, but at least that is less than
  //   all the insert/extracts.
  //
  // In order to do the last, we convert certain trunc's into MVETRUNC, which
  // are either optimized where they can be, or eventually lowered into stack
  // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
  // too early, where other instructions would be better, and stops us from
  // having to reconstruct multiple buildvector shuffles into loads/stores.
  if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
    return SDValue();
  EVT FromVT = N->getOperand(0).getValueType();
  if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
    return SDValue();

  // Split the operand in half and emit a single MVETRUNC of the two halves.
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
  SDLoc DL(N);
  return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
}
9013
                                 const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasMVEIntegerOps())
    return SDValue();

  // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.

  EVT ToVT = N->getValueType(0);
  if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
    return SDValue();
  SDValue Op = N->getOperand(0);
  EVT FromVT = Op.getValueType();
  if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
    return SDValue();

  SDLoc DL(N);
  // The extend produces two half-width results; for i8 -> i32 the first step
  // only widens to i16 and a second extend (below) reaches i32.
  EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
  if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
    ExtVT = MVT::v8i16;

  unsigned Opcode =
  SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
  SDValue Ext1 = Ext.getValue(1);

  // i8 -> i32 goes via two extends: v16i8 -> 2 x v8i16 -> 2 x v8i32.
  if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
    Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
    Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
  }

  // Reassemble the two halves into the full-width result.
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
}
9046
/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
/// element has been zero/sign-extended, depending on the isSigned parameter,
/// from an integer type half its size.
                                   bool isSigned) {
  // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
  EVT VT = N->getValueType(0);
  if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
    SDNode *BVN = N->getOperand(0).getNode();
    if (BVN->getValueType(0) != MVT::v4i32 ||
        BVN->getOpcode() != ISD::BUILD_VECTOR)
      return false;
    // Each i64 element is the (lo, hi) pair of two i32 elements; which i32
    // is the low word depends on endianness.
    unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
    unsigned HiElt = 1 - LoElt;
    if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
      return false;
    if (isSigned) {
      // Sign-extended from i32: the high word must equal the low word's sign.
      if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
          Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
        return true;
    } else {
      // Zero-extended from i32: both high words must be zero.
      if (Hi0->isNullValue() && Hi1->isNullValue())
        return true;
    }
    return false;
  }

  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  // General case: every constant element must fit in half the element width.
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    SDNode *Elt = N->getOperand(i).getNode();
      unsigned EltSize = VT.getScalarSizeInBits();
      unsigned HalfSize = EltSize / 2;
      if (isSigned) {
        if (!isIntN(HalfSize, C->getSExtValue()))
          return false;
      } else {
        if (!isUIntN(HalfSize, C->getZExtValue()))
          return false;
      }
      continue;
    }
    // Non-constant element: not an extended constant BUILD_VECTOR.
    return false;
  }

  return true;
}
9100
 9101 /// isSignExtended - Check if a node is a vector value that is sign-extended
 9102 /// or a constant BUILD_VECTOR with sign-extended elements.
// NOTE(review): the signature line (doc line 9103) is elided in this rendering.
  // Explicit sign-extend nodes and sign-extending loads qualify directly.
 9104   if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
 9105     return true;
  // Otherwise accept constant BUILD_VECTORs whose elements fit in half width.
 9106   if (isExtendedBUILD_VECTOR(N, DAG, true))
 9107     return true;
 9108   return false;
 9109 }
9110
 9111 /// isZeroExtended - Check if a node is a vector value that is zero-extended (or
 9112 /// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
// NOTE(review): the signature line (doc line 9113) is elided in this rendering.
 9114   if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
      // NOTE(review): doc line 9115 (presumably the ISD::isZEXTLoad(N) test that
      // completes this condition) is elided in this rendering.
 9116     return true;
  // Otherwise accept constant BUILD_VECTORs whose elements fit in half width.
 9117   if (isExtendedBUILD_VECTOR(N, DAG, false))
 9118     return true;
 9119   return false;
 9120 }
9121
// NOTE(review): the signature line (doc line 9122) is elided in this rendering;
// from callers below this maps a sub-64-bit vector type to the 64-bit vector
// type with the same element count (used to build legal VMULL operands).
 9123   if (OrigVT.getSizeInBits() >= 64)
 9124     return OrigVT;
 9125
 9126   assert(OrigVT.isSimple() && "Expecting a simple value type");
 9127
 9128   MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
 9129   switch (OrigSimpleTy) {
 9130   default: llvm_unreachable("Unexpected Vector Type");
  // Widen elements, keeping the lane count, until the vector totals 64 bits.
 9131   case MVT::v2i8:
 9132   case MVT::v2i16:
 9133     return MVT::v2i32;
 9134   case MVT::v4i8:
 9135     return MVT::v4i16;
 9136   }
 9137 }
9138
 9139 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
 9140 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
 9141 /// We insert the required extension here to get the vector to fill a D register.
// NOTE(review): the first signature line (doc line 9142) is elided in this
// rendering; the remaining parameters are visible below.
 9143                                             const EVT &OrigTy,
 9144                                             const EVT &ExtTy,
 9145                                             unsigned ExtOpcode) {
 9146   // The vector originally had a size of OrigTy. It was then extended to ExtTy.
 9147   // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
 9148   // 64-bits we need to insert a new extension so that it will be 64-bits.
 9149   assert(ExtTy.is128BitVector() && "Unexpected extension size");
  // Already at least 64 bits: nothing to do.
 9150   if (OrigTy.getSizeInBits() >= 64)
 9151     return N;
 9152
 9153   // Must extend size to at least 64 bits to be used as an operand for VMULL.
 9154   EVT NewVT = getExtensionTo64Bits(OrigTy);
 9155
  // Re-extend with the caller's original extension opcode (sign or zero).
 9156   return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
 9157 }
9158
 9159 /// SkipLoadExtensionForVMULL - return a load of the original vector size that
 9160 /// does not do any sign/zero extension. If the original vector is less
 9161 /// than 64 bits, an appropriate extension will be added after the load to
 9162 /// reach a total size of 64 bits. We have to add the extension separately
 9163 /// because ARM does not have a sign/zero extending load for vectors.
// NOTE(review): the signature line (doc line 9164) is elided in this rendering.
 9165   EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
 9166
 9167   // The load already has the right type.
 9168   if (ExtendedTy == LD->getMemoryVT())
 9169     return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
 9170                        LD->getBasePtr(), LD->getPointerInfo(),
 9171                        LD->getAlignment(), LD->getMemOperand()->getFlags());
 9172
 9173   // We need to create a zextload/sextload. We cannot just create a load
 9174   // followed by a zext/zext node because LowerMUL is also run during normal
 9175   // operation legalization where we can't create illegal types.
  // Preserve the original load's extension kind, chain and memory operand.
 9176   return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
 9177                         LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
 9178                         LD->getMemoryVT(), LD->getAlignment(),
 9179                         LD->getMemOperand()->getFlags());
 9180 }
9181
 9182 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
 9183 /// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
 9184 /// the unextended value. The unextended vector should be 64 bits so that it can
 9185 /// be used as an operand to a VMULL instruction. If the original vector size
 9186 /// before extension is less than 64 bits we add a an extension to resize
 9187 /// the vector to 64 bits.
// NOTE(review): the signature line (doc line 9188) is elided in this rendering.
  // Case 1: an explicit extend node — strip it, re-widening to 64 bits if the
  // pre-extension type was smaller.
 9189   if (N->getOpcode() == ISD::SIGN_EXTEND ||
 9190       N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
 9191     return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
 9192                                         N->getOperand(0)->getValueType(0),
 9193                                         N->getValueType(0),
 9194                                         N->getOpcode());
 9195
  // Case 2: an extending load — replace it with a non-extending load and graft
  // the users of the old load onto the new one.
 9196   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
 9197     assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
 9198            "Expected extending load");
 9199
    // NOTE(review): doc line 9200 (the SkipLoadExtensionForVMULL call that
    // creates newLoad) is elided in this rendering.
 9201     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
 9202     unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    // NOTE(review): doc lines 9203/9205, which build the replacement extend of
    // newLoad and RAUW the old load's value result with it, are partly elided.
 9204         DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
 9206
 9207     return newLoad;
 9208   }
 9209
 9210   // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
 9211   // have been legalized as a BITCAST from v4i32.
 9212   if (N->getOpcode() == ISD::BITCAST) {
 9213     SDNode *BVN = N->getOperand(0).getNode();
 9214     assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
 9215            BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
    // Keep only the two low i32 lanes (endianness decides which lanes those are).
 9216     unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
 9217     return DAG.getBuildVector(
 9218         MVT::v2i32, SDLoc(N),
 9219         {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
 9220   }
 9221   // Construct a new BUILD_VECTOR with elements truncated to half the size.
 9222   assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
 9223   EVT VT = N->getValueType(0);
 9224   unsigned EltSize = VT.getScalarSizeInBits() / 2;
 9225   unsigned NumElts = VT.getVectorNumElements();
 9226   MVT TruncVT = MVT::getIntegerVT(EltSize);
  // NOTE(review): doc line 9227 (presumably the SmallVector<SDValue> Ops
  // declaration) is elided in this rendering.
 9228   SDLoc dl(N);
 9229   for (unsigned i = 0; i != NumElts; ++i) {
 9230     ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
 9231     const APInt &CInt = C->getAPIntValue();
 9232     // Element types smaller than 32 bits are not legal, so use i32 elements.
 9233     // The values are implicitly truncated so sext vs. zext doesn't matter.
 9234     Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
 9235   }
 9236   return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
 9237 }
9238
9239static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
9240 unsigned Opcode = N->getOpcode();
9241 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9242 SDNode *N0 = N->getOperand(0).getNode();
9243 SDNode *N1 = N->getOperand(1).getNode();
9244 return N0->hasOneUse() && N1->hasOneUse() &&
9245 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
9246 }
9247 return false;
9248}
9249
9250static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
9251 unsigned Opcode = N->getOpcode();
9252 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9253 SDNode *N0 = N->getOperand(0).getNode();
9254 SDNode *N1 = N->getOperand(1).getNode();
9255 return N0->hasOneUse() && N1->hasOneUse() &&
9256 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
9257 }
9258 return false;
9259}
9260
// NOTE(review): the signature line (doc line 9261, presumably
// "static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {") is elided in
// this rendering. Lowers 128-bit integer ISD::MUL by forming VMULL (and
// VMULL+VMLAL/VMLSL) where the operands are provably extended.
 9262   // Multiplications are only custom-lowered for 128-bit vectors so that
 9263   // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
 9264   EVT VT = Op.getValueType();
 9265   assert(VT.is128BitVector() && VT.isInteger() &&
 9266          "unexpected type for custom-lowering ISD::MUL");
 9267   SDNode *N0 = Op.getOperand(0).getNode();
 9268   SDNode *N1 = Op.getOperand(1).getNode();
 9269   unsigned NewOpc = 0;
 9270   bool isMLA = false;
 9271   bool isN0SExt = isSignExtended(N0, DAG);
 9272   bool isN1SExt = isSignExtended(N1, DAG);
 9273   if (isN0SExt && isN1SExt)
    // NOTE(review): doc line 9274 (assigning the signed VMULL opcode,
    // presumably NewOpc = ARMISD::VMULLs) is elided in this rendering.
 9275   else {
 9276     bool isN0ZExt = isZeroExtended(N0, DAG);
 9277     bool isN1ZExt = isZeroExtended(N1, DAG);
 9278     if (isN0ZExt && isN1ZExt)
      // NOTE(review): doc line 9279 (the unsigned VMULL opcode assignment,
      // presumably NewOpc = ARMISD::VMULLu) is elided in this rendering.
 9280     else if (isN1SExt || isN1ZExt) {
 9281       // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
 9282       // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
 9283       if (isN1SExt && isAddSubSExt(N0, DAG)) {
        // NOTE(review): doc lines 9284/9287/9291 (the NewOpc assignments for
        // each MLA-forming branch) are elided in this rendering.
 9285         isMLA = true;
 9286       } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
 9288         isMLA = true;
 9289       } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
        // Canonicalize the add/sub to N0 so the common code below applies.
 9290         std::swap(N0, N1);
 9292         isMLA = true;
 9293       }
 9294     }
 9295
 9296     if (!NewOpc) {
 9297       if (VT == MVT::v2i64)
 9298         // Fall through to expand this. It is not legal.
 9299         return SDValue();
 9300       else
 9301         // Other vector multiplications are legal.
 9302         return Op;
 9303     }
 9304   }
 9305
 9306   // Legalize to a VMULL instruction.
 9307   SDLoc DL(Op);
 9308   SDValue Op0;
 9309   SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
 9310   if (!isMLA) {
 9311     Op0 = SkipExtensionForVMULL(N0, DAG);
    // NOTE(review): doc line 9312 (the first half of this assert, checking
    // Op0.getValueType().is64BitVector()) is elided in this rendering.
 9313            Op1.getValueType().is64BitVector() &&
 9314            "unexpected types for extended operands to VMULL");
 9315     return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
 9316   }
 9317
 9318   // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
 9319   // isel lowering to take advantage of no-stall back to back vmul + vmla.
 9320   //   vmull q0, d4, d6
 9321   //   vmlal q0, d5, d6
 9322   // is faster than
 9323   //   vaddl q0, d4, d5
 9324   //   vmovl q1, d6
 9325   //   vmul  q0, q0, q1
  // NOTE(review): doc lines 9326-9327, which extract N00/N01 (the two
  // skipped-extension operands of the add/sub in N0), are elided here.
 9328   EVT Op1VT = Op1.getValueType();
  // Rebuild as (N0->getOpcode()) of two VMULLs so the DAG keeps the add/sub.
 9329   return DAG.getNode(N0->getOpcode(), DL, VT,
 9330                      DAG.getNode(NewOpc, DL, VT,
 9331                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
 9332                      DAG.getNode(NewOpc, DL, VT,
 9333                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
 9334 }
9335
// NOTE(review): the first signature line (doc line 9336, presumably
// "static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,") is
// elided in this rendering. Approximates a v4i8 signed divide (widened to
// v4i16 lanes by the caller) using the NEON reciprocal estimate.
 9337                            SelectionDAG &DAG) {
 9338   // TODO: Should this propagate fast-math-flags?
 9339
 9340   // Convert to float
 9341   // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
 9342   // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
 9343   X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
 9344   Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
 9345   X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
 9346   Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
 9347   // Get reciprocal estimate.
 9348   // float4 recip = vrecpeq_f32(yf);
  // NOTE(review): doc line 9349 (the ISD::INTRINSIC_WO_CHAIN node assignment
  // that this call belongs to) is elided in this rendering.
 9350                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
 9351                   Y);
 9352   // Because char has a smaller range than uchar, we can actually get away
 9353   // without any newton steps. This requires that we use a weird bias
 9354   // of 0xb000, however (again, this has been exhaustively tested).
 9355   // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
 9356   X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
 9357   X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
 9358   Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
 9359   X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
 9360   X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
 9361   // Convert back to short.
 9362   X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
 9363   X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
 9364   return X;
 9365 }
9366
// NOTE(review): the first signature line (doc line 9367, presumably
// "static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,")
// is elided in this rendering. Approximates a v4i16 signed divide using the
// NEON reciprocal estimate plus one Newton refinement step.
 9368                             SelectionDAG &DAG) {
 9369   // TODO: Should this propagate fast-math-flags?
 9370
 9371   SDValue N2;
 9372   // Convert to float.
 9373   // float4 yf = vcvt_f32_s32(vmovl_s16(y));
 9374   // float4 xf = vcvt_f32_s32(vmovl_s16(x));
 9375   N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
  // NOTE(review): doc line 9376 (the matching SIGN_EXTEND of N1) is elided
  // in this rendering.
 9377   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
 9378   N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
 9379
 9380   // Use reciprocal estimate and one refinement step.
 9381   // float4 recip = vrecpeq_f32(yf);
 9382   // recip *= vrecpsq_f32(yf, recip);
  // NOTE(review): doc lines 9383/9386 (the INTRINSIC_WO_CHAIN node
  // assignments these argument lists belong to) are elided in this rendering.
 9384                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
 9385                    N1);
 9387                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
 9388                    N1, N2);
 9389   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
 9390   // Because short has a smaller range than ushort, we can actually get away
 9391   // with only a single newton step. This requires that we use a weird bias
 9392   // of 89, however (again, this has been exhaustively tested).
 9393   // float4 result = as_float4(as_int4(xf*recip) + 0x89);
 9394   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
 9395   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
 9396   N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
 9397   N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
 9398   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
 9399   // Convert back to integer and return.
 9400   // return vmovn_s32(vcvt_s32_f32(result));
 9401   N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
 9402   N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
 9403   return N0;
 9404 }
9405
// NOTE(review): the first signature line (doc line 9406, presumably
// "static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,") is elided in
// this rendering. Custom-lowers v4i16/v8i8 ISD::SDIV via the float-based
// reciprocal helpers above.
 9407                          const ARMSubtarget *ST) {
 9408   EVT VT = Op.getValueType();
 9409   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
 9410          "unexpected type for custom-lowering ISD::SDIV");
 9411
 9412   SDLoc dl(Op);
 9413   SDValue N0 = Op.getOperand(0);
 9414   SDValue N1 = Op.getOperand(1);
 9415   SDValue N2, N3;
 9416
  // v8i8: widen to v8i16, divide each v4i16 half, then concat and narrow.
 9417   if (VT == MVT::v8i8) {
 9418     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
    // NOTE(review): doc lines 9420/9423/9427 (the matching SIGN_EXTEND of N1
    // and the N3/N1 EXTRACT_SUBVECTOR assignments these argument lists belong
    // to) are elided in this rendering.
 9421     N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
 9422                      DAG.getIntPtrConstant(4, dl));
 9424                      DAG.getIntPtrConstant(4, dl));
 9425     N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
 9426                      DAG.getIntPtrConstant(0, dl));
 9428                      DAG.getIntPtrConstant(0, dl));
 9429
 9430     N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
 9431     N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
 9432
 9433     N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
 9434     N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
 9435
 9436     N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
 9437     return N0;
 9438   }
 9439   return LowerSDIV_v4i16(N0, N1, dl, DAG);
 9440 }
9441
// NOTE(review): the first signature line (doc line 9442, presumably
// "static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,") is elided in
// this rendering. Custom-lowers v4i16/v8i8 ISD::UDIV; the unsigned ranges
// allow reusing the signed helpers after zero extension.
 9443                          const ARMSubtarget *ST) {
 9444   // TODO: Should this propagate fast-math-flags?
 9445   EVT VT = Op.getValueType();
 9446   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
 9447          "unexpected type for custom-lowering ISD::UDIV");
 9448
 9449   SDLoc dl(Op);
 9450   SDValue N0 = Op.getOperand(0);
 9451   SDValue N1 = Op.getOperand(1);
 9452   SDValue N2, N3;
 9453
  // v8i8: widen to v8i16, divide each v4i16 half, concat, then saturate back.
 9454   if (VT == MVT::v8i8) {
 9455     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
    // NOTE(review): doc lines 9456/9460/9464 (the matching ZERO_EXTEND of N1
    // and the N3/N1 EXTRACT_SUBVECTOR assignments these argument lists belong
    // to) are elided in this rendering.
 9458     N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
 9459                      DAG.getIntPtrConstant(4, dl));
 9461                      DAG.getIntPtrConstant(4, dl));
 9462     N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
 9463                      DAG.getIntPtrConstant(0, dl));
 9465                      DAG.getIntPtrConstant(0, dl));
 9466
 9467     N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
 9468     N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
 9469
 9470     N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
 9471     N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
 9472
    // NOTE(review): doc line 9473 (the INTRINSIC_WO_CHAIN node assignment
    // using vqmovnsu to narrow with saturation) is partly elided here.
 9474                      DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
 9475                                      MVT::i32),
 9476                      N0);
 9477     return N0;
 9478   }
 9479
 9480   // v4i16 sdiv ... Convert to float.
 9481   // float4 yf = vcvt_f32_s32(vmovl_u16(y));
 9482   // float4 xf = vcvt_f32_s32(vmovl_u16(x));
 9483   N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
  // NOTE(review): doc lines 9484/9486 (the ZERO_EXTEND of N1 and the
  // SINT_TO_FP producing BN1) are elided in this rendering.
 9485   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
 9487
 9488   // Use reciprocal estimate and two refinement steps.
 9489   // float4 recip = vrecpeq_f32(yf);
 9490   // recip *= vrecpsq_f32(yf, recip);
 9491   // recip *= vrecpsq_f32(yf, recip);
  // NOTE(review): doc lines 9492/9495/9499 (the INTRINSIC_WO_CHAIN node
  // assignments these argument lists belong to) are elided in this rendering.
 9493                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
 9494                    BN1);
 9496                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
 9497                    BN1, N2);
 9498   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
 9500                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
 9501                    BN1, N2);
 9502   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
 9503   // Simply multiplying by the reciprocal estimate can leave us a few ulps
 9504   // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
 9505   // and that it will never cause us to return an answer too large).
 9506   // float4 result = as_float4(as_int4(xf*recip) + 2);
 9507   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
 9508   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
 9509   N1 = DAG.getConstant(2, dl, MVT::v4i32);
 9510   N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
 9511   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
 9512   // Convert back to integer and return.
 9513   // return vmovn_u32(vcvt_s32_f32(result));
 9514   N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
 9515   N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
 9516   return N0;
 9517 }
9518
// NOTE(review): the signature line (doc line 9519) is elided in this
// rendering; from the ADDCARRY check below this is the lowering of
// ISD::ADDCARRY/ISD::SUBCARRY to ARMISD::ADDE/SUBE.
 9520   SDNode *N = Op.getNode();
 9521   EVT VT = N->getValueType(0);
  // Result pair: the arithmetic value plus an i32 carry-out.
 9522   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
 9523
 9524   SDValue Carry = Op.getOperand(2);
 9525
 9526   SDLoc DL(Op);
 9527
 9528   SDValue Result;
 9529   if (Op.getOpcode() == ISD::ADDCARRY) {
 9530     // This converts the boolean value carry into the carry flag.
    // NOTE(review): doc line 9531 (the ConvertBooleanCarryToCarryFlag call
    // assigning Carry) is elided in this rendering.
 9532
 9533     // Do the addition proper using the carry flag we wanted.
 9534     Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
 9535                          Op.getOperand(1), Carry);
 9536
 9537     // Now convert the carry flag into a boolean value.
 9538     Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
 9539   } else {
 9540     // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
 9541     // have to invert the carry first.
    // NOTE(review): doc lines 9542/9545 (the ISD::SUB computing 1 - borrow
    // and the ConvertBooleanCarryToCarryFlag call) are elided here.
 9543                         DAG.getConstant(1, DL, MVT::i32), Carry);
 9544     // This converts the boolean value carry into the carry flag.
 9546
 9547     // Do the subtraction proper using the carry flag we wanted.
 9548     Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
 9549                          Op.getOperand(1), Carry);
 9550
 9551     // Now convert the carry flag into a boolean value.
 9552     Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
 9553     // But the carry returned by ARMISD::SUBE is not a borrow as expected
 9554     // by ISD::SUBCARRY, so compute 1 - C.
    // NOTE(review): doc line 9555 (the ISD::SUB node this argument list
    // belongs to) is elided in this rendering.
 9556                         DAG.getConstant(1, DL, MVT::i32), Carry);
 9557   }
 9558
 9559   // Return both values.
 9560   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
 9561 }
9562
// Lower FSINCOS on Darwin by calling the __sincos_stret runtime entry point,
// which computes sin and cos together; on APCS the result pair is returned
// via an sret stack slot, otherwise in registers.
 9563 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
 9564   assert(Subtarget->isTargetDarwin());
 9565
 9566   // For iOS, we want to call an alternative entry point: __sincos_stret,
 9567   // return values are passed via sret.
 9568   SDLoc dl(Op);
 9569   SDValue Arg = Op.getOperand(0);
 9570   EVT ArgVT = Arg.getValueType();
 9571   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
 9572   auto PtrVT = getPointerTy(DAG.getDataLayout());
 9573
  // NOTE(review): doc line 9574 (presumably the MachineFrameInfo &MFI
  // binding used for CreateStackObject below) is elided in this rendering.
 9575   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 9576
 9577   // Pair of floats / doubles used to pass the result.
  // NOTE(review): doc line 9578 (the RetTy StructType construction) is
  // elided in this rendering.
 9579   auto &DL = DAG.getDataLayout();
 9580
 9581   ArgListTy Args;
 9582   bool ShouldUseSRet = Subtarget->isAPCS_ABI();
 9583   SDValue SRet;
 9584   if (ShouldUseSRet) {
 9585     // Create stack object for sret.
 9586     const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
 9587     const Align StackAlign = DL.getPrefTypeAlign(RetTy);
 9588     int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
 9589     SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));
 9590
 9591     ArgListEntry Entry;
 9592     Entry.Node = SRet;
 9593     Entry.Ty = RetTy->getPointerTo();
 9594     Entry.IsSExt = false;
 9595     Entry.IsZExt = false;
 9596     Entry.IsSRet = true;
 9597     Args.push_back(Entry);
    // NOTE(review): doc line 9598 (presumably reassigning RetTy to void for
    // the sret form of the call) is elided in this rendering.
 9599   }
 9600
 9601   ArgListEntry Entry;
 9602   Entry.Node = Arg;
 9603   Entry.Ty = ArgTy;
 9604   Entry.IsSExt = false;
 9605   Entry.IsZExt = false;
 9606   Args.push_back(Entry);
 9607
  // NOTE(review): doc lines 9608 and 9611-9614 (the RTLIB::Libcall binding
  // this initializer belongs to, the calling-convention/callee symbol setup,
  // and the CallLoweringInfo declaration) are elided in this rendering.
 9609       (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
 9610   const char *LibcallName = getLibcallName(LC);
 9615   CLI.setDebugLoc(dl)
 9616       .setChain(DAG.getEntryNode())
 9617       .setCallee(CC, RetTy, Callee, std::move(Args))
  // NOTE(review): doc line 9618 (the .setDiscardResult(ShouldUseSRet) link in
  // this builder chain, presumably) is elided in this rendering.
 9619   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
 9620
 9621   if (!ShouldUseSRet)
 9622     return CallResult.first;
 9623
  // NOTE(review): doc line 9624 (the SDValue LoadSin binding for this load)
  // is elided in this rendering.
 9625       DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
 9626
 9627   // Address of cos field.
 9628   SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
 9629                             DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
  // NOTE(review): doc line 9630 (the SDValue LoadCos binding for this load)
  // is elided in this rendering.
 9631       DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
 9632
 9633   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
 9634   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
 9635                      LoadSin.getValue(0), LoadCos.getValue(0));
 9636 }
9637
// Emit a call to the Windows runtime division helpers (__rt_sdiv[64] /
// __rt_udiv[64]) for i32/i64 divides; Chain is the incoming chain (typically
// carrying the divide-by-zero check).
 9638 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
 9639                                                   bool Signed,
 9640                                                   SDValue &Chain) const {
 9641   EVT VT = Op.getValueType();
 9642   assert((VT == MVT::i32 || VT == MVT::i64) &&
 9643          "unexpected type for custom lowering DIV");
 9644   SDLoc dl(Op);
 9645
 9646   const auto &DL = DAG.getDataLayout();
 9647   const auto &TLI = DAG.getTargetLoweringInfo();
 9648
 9649   const char *Name = nullptr;
 9650   if (Signed)
 9651     Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
 9652   else
 9653     Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";
 9654
  // NOTE(review): doc lines 9655/9657 (presumably the ExternalSymbol ES for
  // Name and the ArgListTy Args declaration) are elided in this rendering.
 9656
 9658
  // The runtime helpers take (divisor, dividend) — hence the {1, 0} order.
 9659   for (auto AI : {1, 0}) {
 9660     ArgListEntry Arg;
 9661     Arg.Node = Op.getOperand(AI);
 9662     Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
 9663     Args.push_back(Arg);
 9664   }
 9665
 9666   CallLoweringInfo CLI(DAG);
 9667   CLI.setDebugLoc(dl)
 9668       .setChain(Chain)
    // NOTE(review): doc line 9669 (the .setCallee(...) link providing the
    // calling convention and return type) is partly elided here.
 9670                  ES, std::move(Args));
 9671
 9672   return LowerCallTo(CLI).first;
 9673 }
9674
 9675 // This is a code size optimisation: return the original SDIV node to
 9676 // DAGCombiner when we don't want to expand SDIV into a sequence of
 9677 // instructions, and an empty node otherwise which will cause the
 9678 // SDIV to be expanded in DAGCombine.
 9679 SDValue
 9680 ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
 9681                                  SelectionDAG &DAG,
// NOTE(review): doc line 9682 (the final parameter of this signature,
// presumably "SmallVectorImpl<SDNode *> &Created) const {") is elided in
// this rendering.
 9683   // TODO: Support SREM
 9684   if (N->getOpcode() != ISD::SDIV)
 9685     return SDValue();
 9686
 9687   const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget());
 9688   const bool MinSize = ST.hasMinSize();
 9689   const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
 9690                                       : ST.hasDivideInARMMode();
 9691
 9692   // Don't touch vector types; rewriting this may lead to scalarizing
 9693   // the int divs.
 9694   if (N->getOperand(0).getValueType().isVector())
 9695     return SDValue();
 9696
 9697   // Bail if MinSize is not set, and also for both ARM and Thumb mode we need
 9698   // hwdiv support for this to be really profitable.
 9699   if (!(MinSize && HasDivide))
 9700     return SDValue();
 9701
 9702   // ARM mode is a bit simpler than Thumb: we can handle large power
 9703   // of 2 immediates with 1 mov instruction; no further checks required,
 9704   // just return the sdiv node.
 9705   if (!ST.isThumb())
 9706     return SDValue(N, 0);
 9707
 9708   // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
 9709   // and thus lose the code size benefits of a MOVS that requires only 2.
 9710   // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
 9711   // but as it's doing exactly this, it's not worth the trouble to get TTI.
 9712   if (Divisor.sgt(128))
 9713     return SDValue();
 9714
 9715   return SDValue(N, 0);
 9716 }
9717
// Lower an i32 divide on Windows: insert the divide-by-zero check, then call
// the runtime division helper.
 9718 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
 9719                                             bool Signed) const {
 9720   assert(Op.getValueType() == MVT::i32 &&
 9721          "unexpected type for custom lowering DIV");
 9722   SDLoc dl(Op);
 9723
  // NOTE(review): doc line 9724 (the SDValue DBZCHK binding, presumably an
  // ARMISD::WIN__DBZCHK node over the divisor) is elided in this rendering.
 9725                                DAG.getEntryNode(), Op.getOperand(1));
 9726
 9727   return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
 9728 }
9729
// NOTE(review): the signature line (doc line 9730, presumably
// "static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N,
// SDValue InChain)") is elided in this rendering. Emits the Windows
// divide-by-zero check (ARMISD::WIN__DBZCHK) over the divisor operand.
 9731   SDLoc DL(N);
 9732   SDValue Op = N->getOperand(1);
  // i32 divisor: check it directly.
 9733   if (N->getValueType(0) == MVT::i32)
 9734     return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
  // NOTE(review): doc lines 9735/9737/9739, which extract the Lo/Hi i32
  // halves of an i64 divisor and open the WIN__DBZCHK node checked below,
  // are elided in this rendering.
 9736                            DAG.getConstant(0, DL, MVT::i32));
 9738                            DAG.getConstant(1, DL, MVT::i32));
  // An i64 divisor is zero iff (Lo | Hi) == 0.
 9740                      DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
 9741 }
9742
// Expand an i64 divide on Windows: divide-by-zero check, runtime libcall,
// then split the i64 result into a (lo, hi) BUILD_PAIR for the legalizer.
 9743 void ARMTargetLowering::ExpandDIV_Windows(
 9744     SDValue Op, SelectionDAG &DAG, bool Signed,
  // NOTE(review): doc line 9745 (the final parameter, presumably
  // "SmallVectorImpl<SDValue> &Results) const {") is elided in this rendering.
 9746   const auto &DL = DAG.getDataLayout();
 9747   const auto &TLI = DAG.getTargetLoweringInfo();
 9748
 9749   assert(Op.getValueType() == MVT::i64 &&
 9750          "unexpected type for custom lowering DIV");
 9751   SDLoc dl(Op);
 9752
 9753   SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
 9754
 9755   SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
 9756
 9757   SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
 9758   SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
 9759                               DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
  // NOTE(review): doc line 9760 (presumably truncating Upper to i32) is
  // elided in this rendering.
 9761
 9762   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
 9763 }
9764
// NOTE(review): the signature line (doc line 9765, presumably
// "static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {") is
// elided in this rendering. Lowers a load of an MVE predicate vector
// (v4i1/v8i1/v16i1) via an integer extload plus PREDICATE_CAST.
 9766   LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
 9767   EVT MemVT = LD->getMemoryVT();
  // NOTE(review): doc line 9768 (the first half of this assert, presumably
  // checking MemVT is one of the v4i1/v8i1/v16i1 predicate types) is elided.
 9769          "Expected a predicate type!");
 9770   assert(MemVT == Op.getValueType());
 9771   assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
 9772          "Expected a non-extending load");
 9773   assert(LD->isUnindexed() && "Expected a unindexed load");
 9774
 9775   // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16bit
 9776   // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
 9777   // need to make sure that 8/4 bits are actually loaded into the correct
 9778   // place, which means loading the value and then shuffling the values into
 9779   // the bottom bits of the predicate.
 9780   // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect
 9781   // for BE).
 9782   // Speaking of BE, apparently the rest of llvm will assume a reverse order to
 9783   // a natural VMSR(load), so needs to be reversed.
 9784
 9785   SDLoc dl(Op);
  // Load the predicate bits as a plain integer of the memory width.
 9786   SDValue Load = DAG.getExtLoad(
 9787       ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
 9788       EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
 9789       LD->getMemOperand());
 9790   SDValue Val = Load;
  // Big-endian: bit-reverse then shift so the predicate bits land at the
  // bottom (see the BE comment above).
 9791   if (DAG.getDataLayout().isBigEndian())
 9792     Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
 9793                       DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
 9794                       DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
 9795   SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
  // Narrow v16i1 down to the requested predicate width if needed.
 9796   if (MemVT != MVT::v16i1)
 9797     Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
 9798                        DAG.getConstant(0, dl, MVT::i32));
 9799   return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
 9800 }
9801
// Custom-lower a load: for a volatile, unindexed i64 load on v5TE+ (non
// Thumb1) emit an atomic-friendly ARMISD::LDRD and repackage the two i32
// results as an i64 pair; otherwise leave Results empty (no custom lowering).
 9802 void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
 9803                                   SelectionDAG &DAG) const {
  // NOTE(review): doc line 9804 (the LoadSDNode *LD = cast<LoadSDNode>(N)
  // binding, presumably) is elided in this rendering.
 9805   EVT MemVT = LD->getMemoryVT();
 9806   assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
 9807
 9808   if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
 9809       !Subtarget->isThumb1Only() && LD->isVolatile()) {
 9810     SDLoc dl(N);
    // NOTE(review): doc line 9811 (the SDValue Result = DAG.getMemIntrinsicNode
    // binding this argument list belongs to, presumably) is elided here.
 9812         ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
 9813         {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
    // LDRD yields (lo, hi, chain); which result is lo depends on endianness.
 9814     SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
 9815     SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
 9816     SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
 9817     Results.append({Pair, Result.getValue(2)});
 9818   }
 9819 }
9820
// NOTE(review): the signature line (doc line 9821, presumably
// "static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {") is
// elided in this rendering. Stores an MVE predicate vector by moving it
// through a GPR and emitting a truncating integer store of just the
// predicate bits.
 9822   StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
 9823   EVT MemVT = ST->getMemoryVT();
  // NOTE(review): doc line 9824 (the first half of this assert, presumably
  // checking MemVT is a v4i1/v8i1/v16i1 predicate type) is elided here.
 9825          "Expected a predicate type!");
 9826   assert(MemVT == ST->getValue().getValueType());
 9827   assert(!ST->isTruncatingStore() && "Expected a non-extending store");
 9828   assert(ST->isUnindexed() && "Expected a unindexed store");
 9829
 9830   // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits
 9831   // unset and a scalar store.
 9832   SDLoc dl(Op);
 9833   SDValue Build = ST->getValue();
 9834   if (MemVT != MVT::v16i1) {
    // NOTE(review): doc line 9835 (presumably the SmallVector<SDValue> Ops
    // declaration filled below) is elided in this rendering.
    // Widen to v16i1 lane-by-lane; BE reverses the in-register lane order.
 9836     for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
 9837       unsigned Elt = DAG.getDataLayout().isBigEndian()
 9838                          ? MemVT.getVectorNumElements() - I - 1
 9839                          : I;
 9840       Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
 9841                                 DAG.getConstant(Elt, dl, MVT::i32)));
 9842     }
 9843     for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
 9844       Ops.push_back(DAG.getUNDEF(MVT::i32));
 9845     Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
 9846   }
  // NOTE(review): doc line 9847 (the SDValue GRP binding, presumably an
  // ARMISD::PREDICATE_CAST of Build to i32) is elided in this rendering.
 9848   if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
    // NOTE(review): doc line 9850 (the BITREVERSE operand of this SRL,
    // presumably) is elided in this rendering.
 9849     GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
 9851                       DAG.getConstant(16, dl, MVT::i32));
  // Truncating store writes only MemVT.getSizeInBits() predicate bits.
 9852   return DAG.getTruncStore(
 9853       ST->getChain(), dl, GRP, ST->getBasePtr(),
 9854       EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
 9855       ST->getMemOperand());
 9856 }
9857
// NOTE(review): the first signature line (doc line 9858, presumably
// "static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,") is elided in
// this rendering. Custom store lowering: volatile i64 stores become
// ARMISD::STRD on v5TE+, MVE predicate stores go to LowerPredicateStore,
// everything else is left alone.
 9859                            const ARMSubtarget *Subtarget) {
 9860   StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
 9861   EVT MemVT = ST->getMemoryVT();
 9862   assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
 9863
 9864   if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
 9865       !Subtarget->isThumb1Only() && ST->isVolatile()) {
 9866     SDNode *N = Op.getNode();
 9867     SDLoc dl(N);
 9868
    // Split the i64 value into i32 halves; lane order follows endianness.
 9869     SDValue Lo = DAG.getNode(
 9870         ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
 9871         DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
 9872                               MVT::i32));
 9873     SDValue Hi = DAG.getNode(
 9874         ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
 9875         DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
 9876                               MVT::i32));
 9877
    // NOTE(review): doc line 9878 (the return DAG.getMemIntrinsicNode(
    // ARMISD::STRD, ... ) opening this argument list, presumably) is elided.
 9879                                    {ST->getChain(), Lo, Hi, ST->getBasePtr()},
 9880                                    MemVT, ST->getMemOperand());
 9881   } else if (Subtarget->hasMVEIntegerOps() &&
 9882              ((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
 9883                MemVT == MVT::v16i1))) {
 9884     return LowerPredicateStore(Op, DAG);
 9885   }
 9886
 9887   return SDValue();
 9888 }
9889
9890static bool isZeroVector(SDValue N) {
9891 return (ISD::isBuildVectorAllZeros(N.getNode()) ||
9892 (N->getOpcode() == ARMISD::VMOVIMM &&
9893 isNullConstant(N->getOperand(0))));
9894}
9895
// NOTE(review): the signature line (doc line 9896, presumably
// "static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {") is elided in
// this rendering. Lowers a masked load for MVE: the hardware zeroes the
// masked-off lanes, so non-zero passthrus need an explicit select.
 9897   MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
 9898   MVT VT = Op.getSimpleValueType();
 9899   SDValue Mask = N->getMask();
 9900   SDValue PassThru = N->getPassThru();
 9901   SDLoc dl(Op);
 9902
  // NOTE(review): doc line 9903 (the early-out condition guarding this
  // return, presumably "if (isZeroVector(PassThru))") is elided here.
 9904     return Op;
 9905
 9906   // MVE Masked loads use zero as the passthru value. Here we convert undef to
 9907   // zero too, and other values are lowered to a select.
 9908   SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
 9909                                 DAG.getTargetConstant(0, dl, MVT::i32));
  // NOTE(review): doc line 9910 (the SDValue NewLoad = DAG.getMaskedLoad(
  // binding this argument list belongs to) is elided in this rendering.
 9911       VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
 9912       N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
 9913       N->getExtensionType(), N->isExpandingLoad());
  // NOTE(review): doc line 9914 (the SDValue Combo = NewLoad binding,
  // presumably) is elided in this rendering.
  // A BITCAST/VECTOR_REG_CAST of a zero vector still counts as zero passthru.
 9915   bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
 9916                              PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
 9917                             isZeroVector(PassThru->getOperand(0));
 9918   if (!PassThru.isUndef() && !PassThruIsCastZero)
 9919     Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
 9920   return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
 9921 }
9922
// NOTE(review): the first signature line (doc line 9923, presumably
// "static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG,") is elided
// in this rendering. Lowers VECREDUCE_* for MVE by pairwise-combining the
// vector with reversed copies of itself, then folding the last lanes
// scalar-by-scalar.
 9924                               const ARMSubtarget *ST) {
 9925   if (!ST->hasMVEIntegerOps())
 9926     return SDValue();
 9927
 9928   SDLoc dl(Op);
  // Map the reduction opcode onto its binary scalar/vector base operation.
 9929   unsigned BaseOpcode = 0;
 9930   switch (Op->getOpcode()) {
 9931   default: llvm_unreachable("Expected VECREDUCE opcode");
 9932   case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
 9933   case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
 9934   case ISD::VECREDUCE_MUL:  BaseOpcode = ISD::MUL; break;
 9935   case ISD::VECREDUCE_AND:  BaseOpcode = ISD::AND; break;
 9936   case ISD::VECREDUCE_OR:   BaseOpcode = ISD::OR; break;
 9937   case ISD::VECREDUCE_XOR:  BaseOpcode = ISD::XOR; break;
 9938   case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
 9939   case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
 9940   }
 9941
 9942   SDValue Op0 = Op->getOperand(0);
 9943   EVT VT = Op0.getValueType();
  // NOTE(review): doc line 9944 (the EVT EltVT binding used below,
  // presumably VT.getVectorElementType()) is elided in this rendering.
 9945   unsigned NumElts = VT.getVectorNumElements();
 9946   unsigned NumActiveLanes = NumElts;
 9947
 9948   assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
 9949           NumActiveLanes == 2) &&
 9950          "Only expected a power 2 vector size");
 9951
 9952   // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements
 9953   // allows us to easily extract vector elements from the lanes.
 9954   while (NumActiveLanes > 4) {
    // NOTE(review): doc line 9955 (the RevOpcode selection, presumably
    // choosing an ARMISD::VREV variant by lane count) is elided here.
 9956     SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
 9957     Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
 9958     NumActiveLanes /= 2;
 9959   }
 9960
 9961   SDValue Res;
 9962   if (NumActiveLanes == 4) {
 9963     // The remaining 4 elements are summed sequentially
    // NOTE(review): doc lines 9964/9966/9968/9970 (the Ext0..Ext3
    // EXTRACT_VECTOR_ELT bindings these constant-index lines belong to) are
    // elided in this rendering.
 9965                                DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
 9967                                DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
 9969                                DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
 9971                                DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
 9972     SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
 9973     SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
 9974     Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
 9975   } else {
    // NOTE(review): doc lines 9976/9978 (the Ext0/Ext1 bindings for the
    // two-lane case) are elided in this rendering.
 9977                                DAG.getConstant(0, dl, MVT::i32));
 9979                                DAG.getConstant(1, dl, MVT::i32));
 9980     Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
 9981   }
 9982
 9983   // Result type may be wider than element type.
 9984   if (EltVT != Op->getValueType(0))
 9985     Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
 9986   return Res;
 9987 }
9988
// LowerVecReduceF — float variant of LowerVecReduce: gated on MVE float
// support, then delegates to the integer/float-shared lowering above.
// NOTE(review): documentation-extract listing — the original signature line
// (9989) is missing; leading numbers are line artifacts.
9990 const ARMSubtarget *ST) {
9991 if (!ST->hasMVEFloatOps())
9992 return SDValue();
9993 return LowerVecReduce(Op, DAG, ST);
9994}
9995
// LowerAtomicLoadStore — atomic load/store is kept as-is (legal) only when
// the ordering is monotonic or weaker; stronger orderings return SDValue()
// so generic expansion (barriers/libcalls) handles them.
// NOTE(review): documentation-extract listing — the original signature line
// (9996) is missing; leading numbers are line artifacts.
9997 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
9998 // Acquire/Release load/store is not legal for targets without a dmb or
9999 // equivalent available.
10000 return SDValue();
10001
10002 // Monotonic load/store is legal for all targets.
10003 return Op;
10004}
10005
// ReplaceREADCYCLECOUNTER — expands READCYCLECOUNTER into an arm_mrc
// intrinsic reading the PMU cycle counter (p15, c9, c13, 0), producing a
// 32-bit count zero-extended into the node's result, plus the chain.
// NOTE(review): documentation-extract listing — the signature lines
// (10006-10007), the final Ops element (10019), and the INTRINSIC_W_CHAIN /
// zero-extend construction lines (10022, 10024) are missing here.
10008 SelectionDAG &DAG,
10009 const ARMSubtarget *Subtarget) {
10010 SDLoc DL(N);
10011 // Under Power Management extensions, the cycle-count is:
10012 // mrc p15, #0, <Rt>, c9, c13, #0
10013 SDValue Ops[] = { N->getOperand(0), // Chain
10014 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
10015 DAG.getTargetConstant(15, DL, MVT::i32),
10016 DAG.getTargetConstant(0, DL, MVT::i32),
10017 DAG.getTargetConstant(9, DL, MVT::i32),
10018 DAG.getTargetConstant(13, DL, MVT::i32),
10020 };
10021
10023 DAG.getVTList(MVT::i32, MVT::Other), Ops);
10025 DAG.getConstant(0, DL, MVT::i32)));
// Second result: the chain from the intrinsic read.
10026 Results.push_back(Cycles32.getValue(1));
10027}
10028
// createGPRPairNode — splits an i64 value into lo/hi i32 halves and builds a
// REG_SEQUENCE machine node placing them in a GPRPair (gsub_0/gsub_1),
// swapping the halves on big-endian targets.
// NOTE(review): documentation-extract listing — the signature line (10029)
// and the start of the VHi declaration (10032) are missing here.
10030 SDLoc dl(V.getNode());
10031 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);
10033 DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)),
10034 dl, MVT::i32);
10035 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10036 if (isBigEndian)
10037 std::swap (VLo, VHi);
10038 SDValue RegClass =
10039 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
10040 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
10041 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
// REG_SEQUENCE operand layout: regclass, value0, subreg0, value1, subreg1.
10042 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
10043 return SDValue(
10044 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
10045}
10046
// ReplaceCMP_SWAP_64Results — lowers a 64-bit ATOMIC_CMP_SWAP to the
// CMP_SWAP_64 pseudo operating on GPR pairs, then extracts the lo/hi
// subregisters back into a BUILD_PAIR i64 result (endian-aware).
// NOTE(review): documentation-extract listing — the signature lines
// (10047-10048) and the getMachineNode/setMemRefs lines (10056, 10058,
// 10061) are missing here.
10049 SelectionDAG &DAG) {
10050 assert(N->getValueType(0) == MVT::i64 &&
10051 "AtomicCmpSwap on types less than 64 should be legal");
// Operands: pointer, expected pair, new-value pair, chain.
10052 SDValue Ops[] = {N->getOperand(1),
10053 createGPRPairNode(DAG, N->getOperand(2)),
10054 createGPRPairNode(DAG, N->getOperand(3)),
10055 N->getOperand(0)};
10057 ARM::CMP_SWAP_64, SDLoc(N),
10059
10060 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
10062
10063 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10064
10065 SDValue Lo =
10066 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
10067 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10068 SDValue Hi =
10069 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
10070 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10071 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
// Result 2 of the pseudo is the output chain.
10072 Results.push_back(SDValue(CmpSwap, 2));
10073}
10074
// LowerFSETCC — lowers STRICT_FSETCC/STRICT_FSETCCS. Unsupported float types
// are softened to a libcall + SETCC; otherwise the condition is mapped to
// (up to two) ARM condition codes and materialized via VFP compare + CMOV,
// returning {result, chain}.
// NOTE(review): documentation-extract listing — the softenSetCCOperands call
// head (line 10087) is missing before the "DAG, LHS.getValueType()..." line.
10075SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
10076 SDLoc dl(Op);
10077 EVT VT = Op.getValueType();
10078 SDValue Chain = Op.getOperand(0);
10079 SDValue LHS = Op.getOperand(1);
10080 SDValue RHS = Op.getOperand(2);
10081 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
// Signaling compares (FSETCCS) must raise invalid on quiet NaNs.
10082 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10083
10084 // If we don't have instructions of this float type then soften to a libcall
10085 // and use SETCC instead.
10086 if (isUnsupportedFloatingType(LHS.getValueType())) {
10088 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS, Chain, IsSignaling);
// A null RHS means the libcall already produced a 0/1 value; compare != 0.
10089 if (!RHS.getNode()) {
10090 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10091 CC = ISD::SETNE;
10092 }
10093 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
10094 DAG.getCondCode(CC));
10095 return DAG.getMergeValues({Result, Chain}, dl);
10096 }
10097
10098 ARMCC::CondCodes CondCode, CondCode2;
10099 FPCCToARMCC(CC, CondCode, CondCode2);
10100
10101 // FIXME: Chain is not handled correctly here. Currently the FPSCR is implicit
10102 // in CMPFP and CMPFPE, but instead it should be made explicit by these
10103 // instructions using a chain instead of glue. This would also fix the problem
10104 // here (and also in LowerSELECT_CC) where we generate two comparisons when
10105 // CondCode2 != AL.
10106 SDValue True = DAG.getConstant(1, dl, VT);
10107 SDValue False = DAG.getConstant(0, dl, VT);
10108 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
10109 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
10110 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
10111 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, CCR, Cmp, DAG);
// Some FP conditions need a second compare/CMOV (e.g. ONE = GT || MI).
10112 if (CondCode2 != ARMCC::AL) {
10113 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
10114 Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
10115 Result = getCMOV(dl, VT, Result, True, ARMcc, CCR, Cmp, DAG);
10116 }
10117 return DAG.getMergeValues({Result, Chain}, dl);
10118}
10119
// ARMTargetLowering::LowerOperation — central dispatch for all custom-lowered
// ISD opcodes: routes each opcode to its dedicated Lower* routine.
// NOTE(review): documentation-extract listing — the signature line (10120)
// and several case lines (10139-10140, 10211-10214, 10221, 10225, 10227)
// are missing; leading numbers are line artifacts.
10121 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
10122 switch (Op.getOpcode()) {
10123 default: llvm_unreachable("Don't know how to custom lower this!");
10124 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
10125 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
10126 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
10127 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
10128 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
10129 case ISD::SELECT: return LowerSELECT(Op, DAG);
10130 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
10131 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
10132 case ISD::BR_CC: return LowerBR_CC(Op, DAG);
10133 case ISD::BR_JT: return LowerBR_JT(Op, DAG);
10134 case ISD::VASTART: return LowerVASTART(Op, DAG);
10135 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
10136 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
10137 case ISD::SINT_TO_FP:
10138 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
10141 case ISD::FP_TO_SINT:
10142 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
10143 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
10144 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
10145 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
10146 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
10147 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
10148 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
10149 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
10150 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
10151 Subtarget);
10152 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
10153 case ISD::SHL:
10154 case ISD::SRL:
10155 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
10156 case ISD::SREM: return LowerREM(Op.getNode(), DAG);
10157 case ISD::UREM: return LowerREM(Op.getNode(), DAG);
10158 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
10159 case ISD::SRL_PARTS:
10160 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
10161 case ISD::CTTZ:
10162 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
10163 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
10164 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
10165 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
10166 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
10167 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
10168 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
10169 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
10170 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
10171 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
10172 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
10173 case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
10174 case ISD::SIGN_EXTEND:
10175 case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
10176 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
10177 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
10178 case ISD::MUL: return LowerMUL(Op, DAG);
// Windows uses a runtime division helper with divide-by-zero checking.
10179 case ISD::SDIV:
10180 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10181 return LowerDIV_Windows(Op, DAG, /* Signed */ true);
10182 return LowerSDIV(Op, DAG, Subtarget);
10183 case ISD::UDIV:
10184 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10185 return LowerDIV_Windows(Op, DAG, /* Signed */ false);
10186 return LowerUDIV(Op, DAG, Subtarget);
10187 case ISD::ADDCARRY:
10188 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
10189 case ISD::SADDO:
10190 case ISD::SSUBO:
10191 return LowerSignedALUO(Op, DAG);
10192 case ISD::UADDO:
10193 case ISD::USUBO:
10194 return LowerUnsignedALUO(Op, DAG);
10195 case ISD::SADDSAT:
10196 case ISD::SSUBSAT:
10197 case ISD::UADDSAT:
10198 case ISD::USUBSAT:
10199 return LowerADDSUBSAT(Op, DAG, Subtarget);
10200 case ISD::LOAD:
10201 return LowerPredicateLoad(Op, DAG);
10202 case ISD::STORE:
10203 return LowerSTORE(Op, DAG, Subtarget);
10204 case ISD::MLOAD:
10205 return LowerMLOAD(Op, DAG);
10206 case ISD::VECREDUCE_MUL:
10207 case ISD::VECREDUCE_AND:
10208 case ISD::VECREDUCE_OR:
10209 case ISD::VECREDUCE_XOR:
10210 return LowerVecReduce(Op, DAG, Subtarget);
10215 return LowerVecReduceF(Op, DAG, Subtarget);
10216 case ISD::ATOMIC_LOAD:
10217 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
10218 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
10219 case ISD::SDIVREM:
10220 case ISD::UDIVREM: return LowerDivRem(Op, DAG);
10222 if (Subtarget->isTargetWindows())
10223 return LowerDYNAMIC_STACKALLOC(Op, DAG);
10224 llvm_unreachable("Don't know how to custom lower this!");
10226 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
10228 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
10229 case ISD::STRICT_FSETCC:
10230 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
10231 case ARMISD::WIN__DBZCHK: return SDValue();
10232 }
10233}
10234
// ReplaceLongIntrinsic — expands the arm_smlald/smlaldx/smlsld/smlsldx
// intrinsics (64-bit dual multiply-accumulate) into the matching ARMISD
// long-multiply node, splitting the i64 accumulator into lo/hi i32 halves
// and rebuilding the i64 result with BUILD_PAIR.
// NOTE(review): documentation-extract listing — the signature line (10235)
// and the extract-element heads for Lo/Hi (10251, 10254) plus the VTList
// line (10259) are missing here.
10236 SelectionDAG &DAG) {
10237 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
10238 unsigned Opc = 0;
10239 if (IntNo == Intrinsic::arm_smlald)
10240 Opc = ARMISD::SMLALD;
10241 else if (IntNo == Intrinsic::arm_smlaldx)
10242 Opc = ARMISD::SMLALDX;
10243 else if (IntNo == Intrinsic::arm_smlsld)
10244 Opc = ARMISD::SMLSLD;
10245 else if (IntNo == Intrinsic::arm_smlsldx)
10246 Opc = ARMISD::SMLSLDX;
10247 else
// Not one of the long intrinsics this helper handles: leave Results empty.
10248 return;
10249
10250 SDLoc dl(N);
10252 N->getOperand(3),
10253 DAG.getConstant(0, dl, MVT::i32));
10255 N->getOperand(3),
10256 DAG.getConstant(1, dl, MVT::i32));
10257
10258 SDValue LongMul = DAG.getNode(Opc, dl,
10260 N->getOperand(1), N->getOperand(2),
10261 Lo, Hi);
10262 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
10263 LongMul.getValue(0), LongMul.getValue(1)));
10264}
10265
10266/// ReplaceNodeResults - Replace the results of node with an illegal result
10267/// type with new values built out of custom code.
// ARMTargetLowering::ReplaceNodeResults — type-legalization hook: expands
// nodes with illegal result types into legal equivalents, pushing the new
// values onto Results.
// NOTE(review): documentation-extract listing — the signature lines
// (10268-10269) and several case labels (10276, 10303, 10311-10314) are
// missing; leading numbers are line artifacts.
10270 SelectionDAG &DAG) const {
10271 SDValue Res;
10272 switch (N->getOpcode()) {
10273 default:
10274 llvm_unreachable("Don't know how to custom expand this!");
10275 case ISD::READ_REGISTER:
10277 break;
10278 case ISD::BITCAST:
10279 Res = ExpandBITCAST(N, DAG, Subtarget);
10280 break;
10281 case ISD::SRL:
10282 case ISD::SRA:
10283 case ISD::SHL:
10284 Res = Expand64BitShift(N, DAG, Subtarget);
10285 break;
10286 case ISD::SREM:
10287 case ISD::UREM:
10288 Res = LowerREM(N, DAG);
10289 break;
// DivRem produces two results; push both and return early.
10290 case ISD::SDIVREM:
10291 case ISD::UDIVREM:
10292 Res = LowerDivRem(SDValue(N, 0), DAG);
10293 assert(Res.getNumOperands() == 2 && "DivRem needs two values");
10294 Results.push_back(Res.getValue(0));
10295 Results.push_back(Res.getValue(1));
10296 return;
10297 case ISD::SADDSAT:
10298 case ISD::SSUBSAT:
10299 case ISD::UADDSAT:
10300 case ISD::USUBSAT:
10301 Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
10302 break;
10304 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
10305 return;
10306 case ISD::UDIV:
10307 case ISD::SDIV:
10308 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
10309 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
10310 Results);
10313 return;
10315 return ReplaceLongIntrinsic(N, Results, DAG);
10316 case ISD::ABS:
10317 lowerABS(N, Results, DAG);
10318 return ;
10319 case ISD::LOAD:
10320 LowerLOAD(N, Results, DAG);
10321 break;
10322 case ISD::TRUNCATE:
10323 Res = LowerTruncate(N, DAG, Subtarget);
10324 break;
10325 case ISD::SIGN_EXTEND:
10326 case ISD::ZERO_EXTEND:
10327 Res = LowerVectorExtend(N, DAG, Subtarget);
10328 break;
10329 }
// Cases that fall through share this single-result push.
10330 if (Res.getNode())
10331 Results.push_back(Res);
10332}
10333
10334//===----------------------------------------------------------------------===//
10335// ARM Scheduler Hooks
10336//===----------------------------------------------------------------------===//
10337
10338/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
10339/// registers the function context.
// SetupEntryBlockForSjLj — emits entry-block code that stores the address of
// the SjLj dispatch block (with the Thumb bit set where relevant) into slot
// jbuf[1] (offset 36) of the function context at frame index FI. Three
// encodings: Thumb2, Thumb1, and ARM.
// NOTE(review): documentation-extract listing — many lines are missing
// (e.g. 10341-10342 of the signature, the constant-pool value creation
// 10359-10360, memory-operand setup 10367-10373, and various operand
// lines); leading numbers are line artifacts.
10340void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
10343 int FI) const {
10344 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
10345 "ROPI/RWPI not currently supported with SjLj");
10346 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10347 DebugLoc dl = MI.getDebugLoc();
10348 MachineFunction *MF = MBB->getParent();
10352 const Function &F = MF->getFunction();
10353
10354 bool isThumb = Subtarget->isThumb();
10355 bool isThumb2 = Subtarget->isThumb2();
10356
// PC-relative label id and PC adjustment (pipeline offset: 4 Thumb, 8 ARM).
10357 unsigned PCLabelId = AFI->createPICLabelUId();
10358 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
10361 unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
10362
10363 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
10364 : &ARM::GPRRegClass;
10365
10366 // Grab constant pool and fixed stack memory operands.
10370
10374
10375 // Load the address of the dispatch MBB into the jump buffer.
10376 if (isThumb2) {
10377 // Incoming value: jbuf
10378 // ldr.n r5, LCPI1_1
10379 // orr r5, r5, #1
10380 // add r5, pc
10381 // str r5, [$jbuf, #+4] ; &jbuf[1]
10382 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10383 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
10387 // Set the low bit because of thumb mode.
10388 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10389 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
10391 .addImm(0x01)
10393 .add(condCodeOp());
10394 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10395 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
10397 .addImm(PCLabelId);
10398 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
10400 .addFrameIndex(FI)
10401 .addImm(36) // &jbuf[1] :: pc
10404 } else if (isThumb) {
10405 // Incoming value: jbuf
10406 // ldr.n r1, LCPI1_4
10407 // add r1, pc
10408 // mov r2, #1
10409 // orrs r1, r2
10410 // add r2, $jbuf, #+4 ; &jbuf[1]
10411 // str r1, [r2]
10412 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10413 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
10417 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10418 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
10420 .addImm(PCLabelId);
10421 // Set the low bit because of thumb mode.
10422 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10423 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
10424 .addReg(ARM::CPSR, RegState::Define)
10425 .addImm(1)
10427 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10428 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
10429 .addReg(ARM::CPSR, RegState::Define)
10433 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10434 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
10435 .addFrameIndex(FI)
10436 .addImm(36); // &jbuf[1] :: pc
10437 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
10440 .addImm(0)
10443 } else {
10444 // Incoming value: jbuf
10445 // ldr r1, LCPI1_1
10446 // add r1, pc, r1
10447 // str r1, [$jbuf, #+4] ; &jbuf[1]
10448 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10449 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
10451 .addImm(0)
10454 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10455 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
10459 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
10461 .addFrameIndex(FI)
10462 .addImm(36) // &jbuf[1] :: pc
10465 }
10466}
10467
// EmitSjLjDispatchBlock — builds the SjLj exception dispatch machinery:
// collects call-site -> landing-pad mappings, creates a jump table over the
// landing pads plus a dispatch block (bounds-checked, falling back to a trap
// block), rewires every invoke block's EH successor to the new dispatch
// block, marks callee-saved registers implicit-def at invoke calls so they
// are spilled, and finally demotes the old landing pads. Separate code paths
// emit the Thumb2 / Thumb1 / ARM instruction sequences.
// NOTE(review): documentation-extract listing — numerous lines are missing
// throughout (declarations such as CallSiteNumToLPad, InvokeBBs, DispatchBB/
// DispContBB/TrapBB creation, and many BuildMI operand lines); leading
// numbers are line artifacts. Do not compile as-is.
10468void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
10469 MachineBasicBlock *MBB) const {
10470 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10471 DebugLoc dl = MI.getDebugLoc();
10472 MachineFunction *MF = MBB->getParent();
10474 MachineFrameInfo &MFI = MF->getFrameInfo();
10475 int FI = MFI.getFunctionContextIndex();
10476
10477 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
10478 : &ARM::GPRnopcRegClass;
10479
10480 // Get a mapping of the call site numbers to all of the landing pads they're
10481 // associated with.
10483 unsigned MaxCSNum = 0;
10484 for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
10485 ++BB) {
10486 if (!BB->isEHPad()) continue;
10487
10488 // FIXME: We should assert that the EH_LABEL is the first MI in the landing
10489 // pad.
10491 II = BB->begin(), IE = BB->end(); II != IE; ++II) {
10492 if (!II->isEHLabel()) continue;
10493
10494 MCSymbol *Sym = II->getOperand(0).getMCSymbol();
10495 if (!MF->hasCallSiteLandingPad(Sym)) continue;
10496
10499 CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
10500 CSI != CSE; ++CSI) {
10501 CallSiteNumToLPad[*CSI].push_back(&*BB);
10502 MaxCSNum = std::max(MaxCSNum, *CSI);
10503 }
10504 break;
10505 }
10506 }
10507
10508 // Get an ordered list of the machine basic blocks for the jump table.
10509 std::vector<MachineBasicBlock*> LPadList;
10511 LPadList.reserve(CallSiteNumToLPad.size());
10512 for (unsigned I = 1; I <= MaxCSNum; ++I) {
10515 II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
10516 LPadList.push_back(*II);
10517 InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
10518 }
10519 }
10520
10521 assert(!LPadList.empty() &&
10522 "No landing pad destinations for the dispatch jump table!");
10523
10524 // Create the jump table and associated information.
10527 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
10528
10529 // Create the MBBs for the dispatch code.
10530
10531 // Shove the dispatch's address into the return slot in the function context.
10533 DispatchBB->setIsEHPad();
10534
10536 unsigned trap_opcode;
10537 if (Subtarget->isThumb())
10538 trap_opcode = ARM::tTRAP;
10539 else
10540 trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
10541
10542 BuildMI(TrapBB, dl, TII->get(trap_opcode));
10543 DispatchBB->addSuccessor(TrapBB);
10544
10546 DispatchBB->addSuccessor(DispContBB);
10547
10548 // Insert and MBBs.
10549 MF->insert(MF->end(), DispatchBB);
10550 MF->insert(MF->end(), DispContBB);
10551 MF->insert(MF->end(), TrapBB);
10552
10553 // Insert code into the entry block that creates and registers the function
10554 // context.
10555 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
10556
10560
10562 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
10563
10564 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
10565 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
10566
10567 // Add a register mask with no preserved registers. This results in all
10568 // registers being marked as clobbered. This can't work if the dispatch block
10569 // is in a Thumb1 function and is linked with ARM code which uses the FP
10570 // registers, as there is no way to preserve the FP registers in Thumb1 mode.
10572
10574 unsigned NumLPads = LPadList.size();
10575 if (Subtarget->isThumb2()) {
10576 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10577 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
10578 .addFrameIndex(FI)
10579 .addImm(4)
10582
10583 if (NumLPads < 256) {
10584 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
10586 .addImm(LPadList.size())
10588 } else {
10589 Register VReg1 = MRI->createVirtualRegister(TRC);
10590 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
10591 .addImm(NumLPads & 0xFFFF)
10593
10594 unsigned VReg2 = VReg1;
10595 if ((NumLPads & 0xFFFF0000) != 0) {
10596 VReg2 = MRI->createVirtualRegister(TRC);
10597 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
10598 .addReg(VReg1)
10599 .addImm(NumLPads >> 16)
10601 }
10602
10603 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
10605 .addReg(VReg2)
10607 }
10608
10609 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
10610 .addMBB(TrapBB)
10612 .addReg(ARM::CPSR);
10613
10614 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10615 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
10618
10619 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10620 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
10625 .add(condCodeOp());
10626
10627 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
10631 } else if (Subtarget->isThumb()) {
10632 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10633 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
10634 .addFrameIndex(FI)
10635 .addImm(1)
10638
10639 if (NumLPads < 256) {
10640 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
10644 } else {
10648
10649 // MachineConstantPool wants an explicit alignment.
10650 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
10651 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
10652
10653 Register VReg1 = MRI->createVirtualRegister(TRC);
10654 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
10658 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
10660 .addReg(VReg1)
10662 }
10663
10664 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
10665 .addMBB(TrapBB)
10667 .addReg(ARM::CPSR);
10668
10669 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10670 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
10671 .addReg(ARM::CPSR, RegState::Define)
10673 .addImm(2)
10675
10676 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10677 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
10680
10681 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10682 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
10683 .addReg(ARM::CPSR, RegState::Define)
10687
10691
10692 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10693 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
10695 .addImm(0)
10698
10699 unsigned NewVReg6 = NewVReg5;
10701 NewVReg6 = MRI->createVirtualRegister(TRC);
10702 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
10703 .addReg(ARM::CPSR, RegState::Define)
10707 }
10708
10709 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
10712 } else {
10713 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10714 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
10715 .addFrameIndex(FI)
10716 .addImm(4)
10719
10720 if (NumLPads < 256) {
10721 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
10725 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
10726 Register VReg1 = MRI->createVirtualRegister(TRC);
10727 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
10728 .addImm(NumLPads & 0xFFFF)
10730
10731 unsigned VReg2 = VReg1;
10732 if ((NumLPads & 0xFFFF0000) != 0) {
10733 VReg2 = MRI->createVirtualRegister(TRC);
10734 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
10735 .addReg(VReg1)
10736 .addImm(NumLPads >> 16)
10738 }
10739
10740 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
10742 .addReg(VReg2)
10744 } else {
10748
10749 // MachineConstantPool wants an explicit alignment.
10750 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
10751 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
10752
10753 Register VReg1 = MRI->createVirtualRegister(TRC);
10754 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
10757 .addImm(0)
10759 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
10763 }
10764
10765 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
10766 .addMBB(TrapBB)
10768 .addReg(ARM::CPSR);
10769
10770 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10771 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
10775 .add(condCodeOp());
10776 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10777 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
10780
10784 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10785 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
10788 .addImm(0)
10791
10793 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
10797 } else {
10798 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
10801 }
10802 }
10803
10804 // Add the jump table entries as successors to the MBB.
10806 for (std::vector<MachineBasicBlock*>::iterator
10807 I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
10808 MachineBasicBlock *CurMBB = *I;
10809 if (SeenMBBs.insert(CurMBB).second)
10810 DispContBB->addSuccessor(CurMBB);
10811 }
10812
10813 // N.B. the order the invoke BBs are processed in doesn't matter here.
10814 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
10816 for (MachineBasicBlock *BB : InvokeBBs) {
10817
10818 // Remove the landing pad successor from the invoke block and replace it
10819 // with the new dispatch block.
10820 SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
10821 while (!Successors.empty()) {
10822 MachineBasicBlock *SMBB = Successors.pop_back_val();
10823 if (SMBB->isEHPad()) {
10824 BB->removeSuccessor(SMBB);
10825 MBBLPads.push_back(SMBB);
10826 }
10827 }
10828
10829 BB->addSuccessor(DispatchBB, BranchProbability::getZero());
10830 BB->normalizeSuccProbs();
10831
10832 // Find the invoke call and mark all of the callee-saved registers as
10833 // 'implicit defined' so that they're spilled. This prevents code from
10834 // moving instructions to before the EH block, where they will never be
10835 // executed.
10837 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
10838 if (!II->isCall()) continue;
10839
10842 OI = II->operands_begin(), OE = II->operands_end();
10843 OI != OE; ++OI) {
10844 if (!OI->isReg()) continue;
10845 DefRegs[OI->getReg()] = true;
10846 }
10847
10848 MachineInstrBuilder MIB(*MF, &*II);
10849
10850 for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
10851 unsigned Reg = SavedRegs[i];
10852 if (Subtarget->isThumb2() &&
10853 !ARM::tGPRRegClass.contains(Reg) &&
10854 !ARM::hGPRRegClass.contains(Reg))
10855 continue;
10856 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
10857 continue;
10858 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
10859 continue;
10860 if (!DefRegs[Reg])
10862 }
10863
10864 break;
10865 }
10866 }
10867
10868 // Mark all former landing pads as non-landing pads. The dispatch is the only
10869 // landing pad now.
10871 I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
10872 (*I)->setIsEHPad(false);
10873
10874 // The instruction is gone now.
10875 MI.eraseFromParent();
10876}
10877
// OtherSucc — given a block with exactly two successors, returns the
// successor that is not Succ; unreachable otherwise.
// NOTE(review): documentation-extract listing — the signature/loop-head
// lines (10879-10880) are missing; leading numbers are line artifacts.
10878static
10881 E = MBB->succ_end(); I != E; ++I)
10882 if (*I != Succ)
10883 return *I;
10884 llvm_unreachable("Expecting a BB with two successors!");
10885}
10886
10887/// Return the load opcode for a given load size. If load size >= 8,
10888/// neon opcode will be returned.
10889static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
10890 if (LdSize >= 8)
10891 return LdSize == 16 ? ARM::VLD1q32wb_fixed
10892 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
10893 if (IsThumb1)
10894 return LdSize == 4 ? ARM::tLDRi
10895 : LdSize == 2 ? ARM::tLDRHi
10896 : LdSize == 1 ? ARM::tLDRBi : 0;
10897 if (IsThumb2)
10898 return LdSize == 4 ? ARM::t2LDR_POST
10899 : LdSize == 2 ? ARM::t2LDRH_POST
10900 : LdSize == 1 ? ARM::t2LDRB_POST : 0;
10901 return LdSize == 4 ? ARM::LDR_POST_IMM
10902 : LdSize == 2 ? ARM::LDRH_POST
10903 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
10904}
10905
10906/// Return the store opcode for a given store size. If store size >= 8,
10907/// neon opcode will be returned.
10908static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
10909 if (StSize >= 8)
10910 return StSize == 16 ? ARM::VST1q32wb_fixed
10911 : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
10912 if (IsThumb1)
10913 return StSize == 4 ? ARM::tSTRi
10914 : StSize == 2 ? ARM::tSTRHi
10915 : StSize == 1 ? ARM::tSTRBi : 0;
10916 if (IsThumb2)
10917 return StSize == 4 ? ARM::t2STR_POST
10918 : StSize == 2 ? ARM::t2STRH_POST
10919 : StSize == 1 ? ARM::t2STRB_POST : 0;
10920 return StSize == 4 ? ARM::STR_POST_IMM
10921 : StSize == 2 ? ARM::STRH_POST
10922 : StSize == 1 ? ARM::STRB_POST_IMM : 0;
10923}
10924
10925/// Emit a post-increment load operation with given size. The instructions
10926/// will be added to BB at Pos.
// emitPostLd — emits a post-increment load of LdSize bytes from AddrIn into
// Data, writing the advanced address to AddrOut. Thumb1 has no
// post-increment loads, so it emits a plain load followed by tADDi8.
// NOTE(review): documentation-extract listing — the first signature line
// (10927), the LdOpc = getLdOpcode(...) line (10931), and several
// predicate-operand lines are missing; leading numbers are line artifacts.
10928 const TargetInstrInfo *TII, const DebugLoc &dl,
10929 unsigned LdSize, unsigned Data, unsigned AddrIn,
10930 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
10932 assert(LdOpc != 0 && "Should have a load opcode");
10933 if (LdSize >= 8) {
10934 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
10936 .addReg(AddrIn)
10937 .addImm(0)
10939 } else if (IsThumb1) {
10940 // load + update AddrIn
10941 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
10942 .addReg(AddrIn)
10943 .addImm(0)
10945 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
10946 .add(t1CondCodeOp())
10947 .addReg(AddrIn)
10948 .addImm(LdSize)
10950 } else if (IsThumb2) {
10951 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
10953 .addReg(AddrIn)
10954 .addImm(LdSize)
10956 } else { // arm
10957 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
10959 .addReg(AddrIn)
10960 .addReg(0)
10961 .addImm(LdSize)
10963 }
10964}
10965
10966/// Emit a post-increment store operation with given size. The instructions
10967/// will be added to BB at Pos.
// emitPostSt — emits a post-increment store of StSize bytes of Data to
// AddrIn, writing the advanced address to AddrOut. Mirrors emitPostLd:
// Thumb1 has no post-increment stores, so it emits store + tADDi8.
// NOTE(review): documentation-extract listing — the first signature line
// (10968), the StOpc = getStOpcode(...) line (10972), and several
// predicate-operand lines are missing; leading numbers are line artifacts.
10969 const TargetInstrInfo *TII, const DebugLoc &dl,
10970 unsigned StSize, unsigned Data, unsigned AddrIn,
10971 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
10973 assert(StOpc != 0 && "Should have a store opcode");
10974 if (StSize >= 8) {
10975 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
10976 .addReg(AddrIn)
10977 .addImm(0)
10978 .addReg(Data)
10980 } else if (IsThumb1) {
10981 // store + update AddrIn
10982 BuildMI(*BB, Pos, dl, TII->get(StOpc))
10983 .addReg(Data)
10984 .addReg(AddrIn)
10985 .addImm(0)
10987 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
10988 .add(t1CondCodeOp())
10989 .addReg(AddrIn)
10990 .addImm(StSize)
10992 } else if (IsThumb2) {
10993 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
10994 .addReg(Data)
10995 .addReg(AddrIn)
10996 .addImm(StSize)
10998 } else { // arm
10999 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11000 .addReg(Data)
11001 .addReg(AddrIn)
11002 .addReg(0)
11003 .addImm(StSize)
11005 }
11006}
11007
11009ARMTargetLowering::EmitStructByval(MachineInstr &MI,
11010                                   MachineBasicBlock *BB) const {
11011  // This pseudo instruction has 3 operands: dst, src, size
11012  // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
11013  // Otherwise, we will generate unrolled scalar copies.
  // NOTE(review): this listing is an excerpt; several original statements
  // (iterator/MBB declarations, predOps() operands, constant-pool operands)
  // are elided by the line-numbered scrape.
11014  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11015  const BasicBlock *LLVM_BB = BB->getBasicBlock();
11017
11018  Register dest = MI.getOperand(0).getReg();
11019  Register src = MI.getOperand(1).getReg();
11020  unsigned SizeVal = MI.getOperand(2).getImm();
11021  unsigned Alignment = MI.getOperand(3).getImm();
11022  DebugLoc dl = MI.getDebugLoc();
11023
11024  MachineFunction *MF = BB->getParent();
11026  unsigned UnitSize = 0;
11027  const TargetRegisterClass *TRC = nullptr;
11028  const TargetRegisterClass *VecTRC = nullptr;
11029
11030  bool IsThumb1 = Subtarget->isThumb1Only();
11031  bool IsThumb2 = Subtarget->isThumb2();
11032  bool IsThumb = Subtarget->isThumb();
11033
  // Pick the per-iteration copy unit from the byte alignment of the copy:
  // byte/halfword for alignments 1/2, otherwise word or (when NEON is
  // usable and the copy is big enough) an 8- or 16-byte vector unit.
11034  if (Alignment & 1) {
11035    UnitSize = 1;
11036  } else if (Alignment & 2) {
11037    UnitSize = 2;
11038  } else {
11039    // Check whether we can use NEON instructions.
11040    if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
11041        Subtarget->hasNEON()) {
11042      if ((Alignment % 16 == 0) && SizeVal >= 16)
11043        UnitSize = 16;
11044      else if ((Alignment % 8 == 0) && SizeVal >= 8)
11045        UnitSize = 8;
11046    }
11047    // Can't use NEON instructions.
11048    if (UnitSize == 0)
11049      UnitSize = 4;
11050  }
11051
11052  // Select the correct opcode and register class for unit size load/store
11053  bool IsNeon = UnitSize >= 8;
11054  TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
11055  if (IsNeon)
11056    VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
11057             : UnitSize == 8 ? &ARM::DPRRegClass
11058                             : nullptr;
11059
11060  unsigned BytesLeft = SizeVal % UnitSize;
11061  unsigned LoopSize = SizeVal - BytesLeft;
11062
  // Small copies: emit a fully unrolled sequence of post-incremented
  // load/store pairs instead of a loop.
11063  if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
11064    // Use LDR and STR to copy.
11065    // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
11066    // [destOut] = STR_POST(scratch, destIn, UnitSize)
11067    unsigned srcIn = src;
11068    unsigned destIn = dest;
11069    for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
11070      Register srcOut = MRI.createVirtualRegister(TRC);
11071      Register destOut = MRI.createVirtualRegister(TRC);
11072      Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11073      emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
11075      emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
11077      srcIn = srcOut;
11078      destIn = destOut;
11079    }
11080
11081    // Handle the leftover bytes with LDRB and STRB.
11082    // [scratch, srcOut] = LDRB_POST(srcIn, 1)
11083    // [destOut] = STRB_POST(scratch, destIn, 1)
11084    for (unsigned i = 0; i < BytesLeft; i++) {
11085      Register srcOut = MRI.createVirtualRegister(TRC);
11086      Register destOut = MRI.createVirtualRegister(TRC);
11087      Register scratch = MRI.createVirtualRegister(TRC);
11088      emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
11090      emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
11092      srcIn = srcOut;
11093      destIn = destOut;
11094    }
11095    MI.eraseFromParent(); // The instruction is gone now.
11096    return BB;
11097  }
11098
11099  // Expand the pseudo op to a loop.
11100  // thisMBB:
11101  // ...
11102  // movw varEnd, # --> with thumb2
11103  // movt varEnd, #
11104  // ldrcp varEnd, idx --> without thumb2
11105  // fallthrough --> loopMBB
11106  // loopMBB:
11107  // PHI varPhi, varEnd, varLoop
11108  // PHI srcPhi, src, srcLoop
11109  // PHI destPhi, dst, destLoop
11110  // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11111  // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11112  // subs varLoop, varPhi, #UnitSize
11113  // bne loopMBB
11114  // fallthrough --> exitMBB
11115  // exitMBB:
11116  // epilogue to handle left-over bytes
11117  // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11118  // [destOut] = STRB_POST(scratch, destLoop, 1)
11121  MF->insert(It, loopMBB);
11122  MF->insert(It, exitMBB);
11123
11124  // Transfer the remainder of BB and its successor edges to exitMBB.
11125  exitMBB->splice(exitMBB->begin(), BB,
11126                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
11127  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
11128
11129  // Load an immediate to varEnd.
  // varEnd holds the byte count handled by the main loop (LoopSize).
  // With movt/movw support it is materialized directly; otherwise it is
  // loaded from the constant pool.
11130  Register varEnd = MRI.createVirtualRegister(TRC);
11131  if (Subtarget->useMovt()) {
11132    unsigned Vtmp = varEnd;
    // If the upper half is nonzero, movw writes a temp that movt then tops up.
11133    if ((LoopSize & 0xFFFF0000) != 0)
11134      Vtmp = MRI.createVirtualRegister(TRC);
11135    BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp)
11136        .addImm(LoopSize & 0xFFFF)
11137        .add(predOps(ARMCC::AL));
11138
11139    if ((LoopSize & 0xFFFF0000) != 0)
11140      BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd)
11141          .addReg(Vtmp)
11142          .addImm(LoopSize >> 16)
11143          .add(predOps(ARMCC::AL));
11144  } else {
11147    const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
11148
11149    // MachineConstantPool wants an explicit alignment.
11150    Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11151    unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11155
11156    if (IsThumb)
11157      BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
11162    else
11163      BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
11166          .addImm(0)
11169  }
11170  BB->addSuccessor(loopMBB);
11171
11172  // Generate the loop body:
11173  // PHI varPhi = PHI(varLoop, varEnd)
11174  // PHI srcPhi = PHI(srcLoop, src)
11175  // PHI destPhi = PHI(destLoop, dst)
11177  BB = loopMBB;
11178  Register varLoop = MRI.createVirtualRegister(TRC);
11179  Register varPhi = MRI.createVirtualRegister(TRC);
11180  Register srcLoop = MRI.createVirtualRegister(TRC);
11181  Register srcPhi = MRI.createVirtualRegister(TRC);
11182  Register destLoop = MRI.createVirtualRegister(TRC);
11183  Register destPhi = MRI.createVirtualRegister(TRC);
11184
11185  BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
11188  BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
11190      .addReg(src).addMBB(entryBB);
11191  BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
11194
11195  // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11196  // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11197  Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11198  emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
11200  emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
11202
11203  // Decrement loop variable by UnitSize.
11204  if (IsThumb1) {
11205    BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
11206        .add(t1CondCodeOp())
11207        .addReg(varPhi)
11208        .addImm(UnitSize)
11210  } else {
11212        BuildMI(*BB, BB->end(), dl,
11213                TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
    // Force the optional cc_out operand to be a live CPSR def so the
    // following conditional branch can test the subtraction's flags.
11214    MIB.addReg(varPhi)
11215        .addImm(UnitSize)
11217        .add(condCodeOp());
11218    MIB->getOperand(5).setReg(ARM::CPSR);
11219    MIB->getOperand(5).setIsDef(true);
11220  }
11221  BuildMI(*BB, BB->end(), dl,
11222          TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
11223      .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
11224
11225  // loopMBB can loop back to loopMBB or fall through to exitMBB.
11226  BB->addSuccessor(loopMBB);
11227  BB->addSuccessor(exitMBB);
11228
11229  // Add epilogue to handle BytesLeft.
11230  BB = exitMBB;
11231  auto StartOfExit = exitMBB->begin();
11232
11233  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11234  //   [destOut] = STRB_POST(scratch, destLoop, 1)
  // The tail (SizeVal % UnitSize bytes) is copied byte-by-byte, unrolled.
11235  unsigned srcIn = srcLoop;
11236  unsigned destIn = destLoop;
11237  for (unsigned i = 0; i < BytesLeft; i++) {
11238    Register srcOut = MRI.createVirtualRegister(TRC);
11239    Register destOut = MRI.createVirtualRegister(TRC);
11240    Register scratch = MRI.createVirtualRegister(TRC);
11245    srcIn = srcOut;
11246    destIn = destOut;
11247  }
11248
11249  MI.eraseFromParent(); // The instruction is gone now.
11250  return BB;
11251}
11252
11254ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
11255                                       MachineBasicBlock *MBB) const {
  // Lowers the WIN__CHKSTK pseudo: call Windows' __chkstk stack probe and
  // then subtract the returned adjustment (in R4) from SP.
  // NOTE(review): excerpt — some register/operand lines are elided here.
11257  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
11258  DebugLoc DL = MI.getDebugLoc();
11259
11260  assert(Subtarget->isTargetWindows() &&
11261         "__chkstk is only supported on Windows");
11262  assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
11263
11264  // __chkstk takes the number of words to allocate on the stack in R4, and
11265  // returns the stack adjustment in number of bytes in R4. This will not
11266  // clobber any other registers (other than the obvious lr).
11267  //
11268  // Although, technically, IP should be considered a register which may be
11269  // clobbered, the call itself will not touch it. Windows on ARM is a pure
11270  // thumb-2 environment, so there is no interworking required. As a result, we
11271  // do not expect a veneer to be emitted by the linker, clobbering IP.
11272  //
11273  // Each module receives its own copy of __chkstk, so no import thunk is
11274  // required, again, ensuring that IP is not clobbered.
11275  //
11276  // Finally, although some linkers may theoretically provide a trampoline for
11277  // out of range calls (which is quite common due to a 32M range limitation of
11278  // branches for Thumb), we can generate the long-call version via
11279  // -mcmodel=large, alleviating the need for the trampoline which may clobber
11280  // IP.
11281
11282  switch (TM.getCodeModel()) {
11283  case CodeModel::Tiny:
11284    llvm_unreachable("Tiny code model not available on ARM.");
11285  case CodeModel::Small:
11286  case CodeModel::Medium:
11287  case CodeModel::Kernel:
    // Direct BL to __chkstk; in-range for these code models.
11288    BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
11290        .addExternalSymbol("__chkstk")
11293        .addReg(ARM::R12,
11295        .addReg(ARM::CPSR,
11297    break;
11298  case CodeModel::Large: {
    // Large code model: materialize the address with t2MOVi32imm and make
    // an indirect call, since a direct branch may be out of range.
11300    Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11301
11302    BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
11303        .addExternalSymbol("__chkstk");
11306        .addReg(Reg, RegState::Kill)
11309        .addReg(ARM::R12,
11311        .addReg(ARM::CPSR,
11313    break;
11314  }
11315  }
11316
  // SP -= R4: apply the byte adjustment computed by __chkstk.
11317  BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
11318      .addReg(ARM::SP, RegState::Kill)
11319      .addReg(ARM::R4, RegState::Kill)
11322      .add(condCodeOp());
11323
11324  MI.eraseFromParent();
11325  return MBB;
11326}
11327
11329ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
11330                                       MachineBasicBlock *MBB) const {
  // Lowers the WIN__DBZCHK pseudo: compare the divisor (operand 0) against
  // zero and branch to a trap block containing __brkdiv0 when it is zero;
  // otherwise fall through into ContBB, which receives the rest of MBB.
  // NOTE(review): excerpt — the ContBB/TrapBB creation lines are elided here.
11331  DebugLoc DL = MI.getDebugLoc();
11332  MachineFunction *MF = MBB->getParent();
11333  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11334
11336  MF->insert(++MBB->getIterator(), ContBB);
11337  ContBB->splice(ContBB->begin(), MBB,
11338                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
11339  ContBB->transferSuccessorsAndUpdatePHIs(MBB);
11341
11343  BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
11344  MF->push_back(TrapBB);
11346
11347  BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
11348      .addReg(MI.getOperand(0).getReg())
11349      .addImm(0)
11351  BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
11352      .addMBB(TrapBB)
11354      .addReg(ARM::CPSR);
11355
11356  MI.eraseFromParent();
  // Execution continues in ContBB, so return it as the insertion block.
11357  return ContBB;
11358}
11359
11360// The CPSR operand of SelectItr might be missing a kill marker
11361// because there were multiple uses of CPSR, and ISel didn't know
11362// which to mark. Figure out whether SelectItr should have had a
11363// kill marker, and set it if it should. Returns the correct kill
11364// marker value.
11367                                       const TargetRegisterInfo* TRI) {
  // NOTE(review): excerpt — the function header (taking SelectItr and BB)
  // and the iterator initialization for miI are elided in this listing.
11368  // Scan forward through BB for a use/def of CPSR.
11370  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
11371    const MachineInstr& mi = *miI;
    // A later reader means CPSR is live past SelectItr: no kill flag.
11372    if (mi.readsRegister(ARM::CPSR))
11373      return false;
11374    if (mi.definesRegister(ARM::CPSR))
11375      break; // Should have kill-flag - update below.
11376  }
11377
11378  // If we hit the end of the block, check whether CPSR is live into a
11379  // successor.
11380  if (miI == BB->end()) {
11382           sEnd = BB->succ_end();
11383         sItr != sEnd; ++sItr) {
11384      MachineBasicBlock* succ = *sItr;
11385      if (succ->isLiveIn(ARM::CPSR))
11386        return false;
11387    }
11388  }
11389
11390  // We found a def, or hit the end of the basic block and CPSR wasn't live
11391  // out. SelectMI should have a kill flag on CPSR.
11392  SelectItr->addRegisterKilled(ARM::CPSR, TRI);
11393  return true;
11394}
11395
11396/// Adds logic in loop entry MBB to calculate loop iteration count and adds
11397/// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop
11403  // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
  // NOTE(review): excerpt — the function signature and several operand lines
  // (source register uses, predicate operands, target MBB of t2B) are elided.
11404  Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11405  BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
11407      .addImm(15)
11409      .addReg(0);
11410
11411  Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11412  BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
11414      .addImm(4)
11416      .addReg(0);
11417
  // The iteration count lives in an LR-class register, as required by the
  // low-overhead-loop (WLS) pseudos.
11418  Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11419  BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
11421
  // t2WhileLoopStart branches to TpExit when the trip count is zero.
11422  BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
11424      .addMBB(TpExit);
11425
11426  BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
11429
11430  return TotalIterationsReg;
11431}
11432
11433/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and
11434/// t2DoLoopEnd. These are used by later passes to generate tail predicated
11435/// loops.
11442 // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
11443 // array, loop iteration counter, predication counter.
11444
11446  if (IsMemcpy) {
11447    // Current position in the src array
11448    SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11449    CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11450    BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
11452        .addMBB(TpEntry)
11455  }
11456
  // NOTE(review): excerpt — the function signature, the SrcPhiReg/CurrSrcReg
  // declarations, and the PHI incoming-value operands are elided here.
11457  // Current position in the dest array
11458  Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11459  Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11460  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
11462      .addMBB(TpEntry)
11465
11466  // Current loop counter
11467  Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11469      MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11470  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
11472      .addMBB(TpEntry)
11475
11476  // Predication counter
11477  Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11478  Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11479  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
11481      .addMBB(TpEntry)
11484
11485  // Pass predication counter to VCTP
  // VCTP8 turns the remaining-element count into a byte-lane predicate in
  // a VCCR register, used to mask the trailing partial vector.
11486  Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
11487  BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
11490      .addReg(0);
11491
  // 16 byte lanes are consumed per iteration.
11492  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
11494      .addImm(16)
11496      .addReg(0);
11497
11498  // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
11500  if (IsMemcpy) {
11501    SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
11502    BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
11506        .addImm(16)
11508        .addUse(VccrReg);
11509  } else
11511
11512  BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
11516      .addImm(16)
11518      .addUse(VccrReg);
11519
11520  // Add the pseudoInstrs for decrementing the loop counter and marking the
11521  // end: t2DoLoopDec and t2DoLoopEnd
11522  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
11524      .addImm(1);
11525
11526  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
11529
11530  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
11531      .addMBB(TpExit)
11533}
11534
11537                                               MachineBasicBlock *BB) const {
  // Custom inserter dispatch: expands ARM pseudo instructions that need
  // control flow or operand surgery after instruction selection.
  // NOTE(review): excerpt — the function header and a number of operand and
  // declaration lines are elided by the line-numbered scrape.
11538  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11539  DebugLoc dl = MI.getDebugLoc();
11540  bool isThumb2 = Subtarget->isThumb2();
11541  switch (MI.getOpcode()) {
11542  default: {
11543    MI.print(errs());
11544    llvm_unreachable("Unexpected instr type to insert");
11545  }
11546
11547  // Thumb1 post-indexed loads are really just single-register LDMs.
11548  case ARM::tLDR_postidx: {
11549    MachineOperand Def(MI.getOperand(1));
11550    BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
11551        .add(Def)  // Rn_wb
11552        .add(MI.getOperand(2))  // Rn
11553        .add(MI.getOperand(3))  // PredImm
11554        .add(MI.getOperand(4))  // PredReg
11555        .add(MI.getOperand(0))  // Rt
11556        .cloneMemRefs(MI);
11557    MI.eraseFromParent();
11558    return BB;
11559  }
11560
11561  case ARM::MVE_MEMCPYLOOPINST:
11562  case ARM::MVE_MEMSETLOOPINST: {
11563
11564    // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo
11565    // into a Tail Predicated (TP) Loop. It adds the instructions to calculate
11566    // the iteration count =ceil(size_in_bytes/16)) in the TP entry block and
11567    // adds the relevant instructions in the TP loop Body for generation of a
11568    // WLSTP loop.
11569
11570    // Below is relevant portion of the CFG after the transformation.
11571    // The Machine Basic Blocks are shown along with branch conditions (in
11572    // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
11573    // portion of the CFG and may not necessarily be the entry/exit of the
11574    // function.
11575
11576    //             (Relevant) CFG after transformation:
11577    //               TP entry MBB
11578    //                   |
11579    //          |-----------------|
11580    //       (n <= 0)          (n > 0)
11581    //          |                 |
11582    //          |         TP loop Body MBB<--|
11583    //          |                 |          |
11584    //           \                |___________|
11585    //            \              /
11586    //              TP exit MBB
11587
11588    MachineFunction *MF = BB->getParent();
11589    MachineFunctionProperties &Properties = MF->getProperties();
11591
11592    Register OpDestReg = MI.getOperand(0).getReg();
11593    Register OpSrcReg = MI.getOperand(1).getReg();
11594    Register OpSizeReg = MI.getOperand(2).getReg();
11595
11596    // Allocate the required MBBs and add to parent function.
11600
11601    MF->push_back(TpLoopBody);
11602
11603    // If any instructions are present in the current block after
11604    // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
11605    // move the instructions into the newly created exit block. If there are no
11606    // instructions add an explicit branch to the FallThrough block and then
11607    // split.
11608    //
11609    // The split is required for two reasons:
11610    // 1) A terminator(t2WhileLoopStart) will be placed at that site.
11611    // 2) Since a TPLoopBody will be added later, any phis in successive blocks
11612    //    need to be updated. splitAt() already handles this.
11613    TpExit = BB->splitAt(MI, false);
11614    if (TpExit == BB) {
11615      assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
11616                                     "block containing memcpy/memset Pseudo");
11617      TpExit = BB->getFallThrough();
11618      BuildMI(BB, dl, TII->get(ARM::t2B))
11619          .addMBB(TpExit)
11621      TpExit = BB->splitAt(MI, false);
11622    }
11623
11624    // Add logic for iteration count
11627
11628    // Add the vectorized (and predicated) loads/store instructions
11629    bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
11632
11633    // Required to avoid conflict with the MachineVerifier during testing.
11635
11636    // Connect the blocks
11637    TpEntry->addSuccessor(TpLoopBody);
11638    TpLoopBody->addSuccessor(TpLoopBody);
11639    TpLoopBody->addSuccessor(TpExit);
11640
11641    // Reorder for a more natural layout
11642    TpLoopBody->moveAfter(TpEntry);
11643    TpExit->moveAfter(TpLoopBody);
11644
11645    // Finally, remove the memcpy Pseudo Instruction
11646    MI.eraseFromParent();
11647
11648    // Return the exit block as it may contain other instructions requiring a
11649    // custom inserter
11650    return TpExit;
11651  }
11652
11653  // The Thumb2 pre-indexed stores have the same MI operands, they just
11654  // define them differently in the .td files from the isel patterns, so
11655  // they need pseudos.
11656  case ARM::t2STR_preidx:
11657    MI.setDesc(TII->get(ARM::t2STR_PRE));
11658    return BB;
11659  case ARM::t2STRB_preidx:
11660    MI.setDesc(TII->get(ARM::t2STRB_PRE));
11661    return BB;
11662  case ARM::t2STRH_preidx:
11663    MI.setDesc(TII->get(ARM::t2STRH_PRE));
11664    return BB;
11665
11666  case ARM::STRi_preidx:
11667  case ARM::STRBi_preidx: {
11668    unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
11669                                                         : ARM::STRB_PRE_IMM;
11670    // Decode the offset.
11671    unsigned Offset = MI.getOperand(4).getImm();
11674    if (isSub)
11675      Offset = -Offset;
11676
11677    MachineMemOperand *MMO = *MI.memoperands_begin();
11678    BuildMI(*BB, MI, dl, TII->get(NewOpc))
11679        .add(MI.getOperand(0))  // Rn_wb
11680        .add(MI.getOperand(1))  // Rt
11681        .add(MI.getOperand(2))  // Rn
11682        .addImm(Offset)         // offset (skip GPR==zero_reg)
11683        .add(MI.getOperand(5))  // pred
11684        .add(MI.getOperand(6))
11685        .addMemOperand(MMO);
11686    MI.eraseFromParent();
11687    return BB;
11688  }
11689  case ARM::STRr_preidx:
11690  case ARM::STRBr_preidx:
11691  case ARM::STRH_preidx: {
11692    unsigned NewOpc;
11693    switch (MI.getOpcode()) {
11694    default: llvm_unreachable("unexpected opcode!");
11695    case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
11696    case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
11697    case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
11698    }
    // Operand lists match one-for-one; just retarget the opcode.
11699    MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
11700    for (unsigned i = 0; i < MI.getNumOperands(); ++i)
11701      MIB.add(MI.getOperand(i));
11702    MI.eraseFromParent();
11703    return BB;
11704  }
11705
11706  case ARM::tMOVCCr_pseudo: {
11707    // To "insert" a SELECT_CC instruction, we actually have to insert the
11708    // diamond control-flow pattern.  The incoming instruction knows the
11709    // destination vreg to set, the condition code register to branch on, the
11710    // true/false values to select between, and a branch opcode to use.
11711    const BasicBlock *LLVM_BB = BB->getBasicBlock();
11713
11714    //  thisMBB:
11715    //  ...
11716    //   TrueVal = ...
11717    //   cmpTY ccX, r1, r2
11718    //   bCC copy1MBB
11719    //   fallthrough --> copy0MBB
11721    MachineFunction *F = BB->getParent();
11722    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
11723    MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
11724    F->insert(It, copy0MBB);
11725    F->insert(It, sinkMBB);
11726
11727    // Check whether CPSR is live past the tMOVCCr_pseudo.
11728    const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
11729    if (!MI.killsRegister(ARM::CPSR) &&
11731      copy0MBB->addLiveIn(ARM::CPSR);
11732      sinkMBB->addLiveIn(ARM::CPSR);
11733    }
11734
11735    // Transfer the remainder of BB and its successor edges to sinkMBB.
11736    sinkMBB->splice(sinkMBB->begin(), BB,
11737                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
11738    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
11739
11741    BB->addSuccessor(sinkMBB);
11742
11743    BuildMI(BB, dl, TII->get(ARM::tBcc))
11744        .addMBB(sinkMBB)
11745        .addImm(MI.getOperand(3).getImm())
11746        .addReg(MI.getOperand(4).getReg());
11747
11748    //  copy0MBB:
11749    //   %FalseValue = ...
11750    //   # fallthrough to sinkMBB
11751    BB = copy0MBB;
11752
11753    // Update machine-CFG edges
11754    BB->addSuccessor(sinkMBB);
11755
11756    //  sinkMBB:
11757    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
11758    //  ...
11759    BB = sinkMBB;
11760    BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
11761        .addReg(MI.getOperand(1).getReg())
11763        .addReg(MI.getOperand(2).getReg())
11764        .addMBB(thisMBB);
11765
11766    MI.eraseFromParent(); // The pseudo instruction is gone now.
11767    return BB;
11768  }
11769
11770  case ARM::BCCi64:
11771  case ARM::BCCZi64: {
11772    // If there is an unconditional branch to the other successor, remove it.
11773    BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
11774
11775    // Compare both parts that make up the double comparison separately for
11776    // equality.
11777    bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
11778
11779    Register LHS1 = MI.getOperand(1).getReg();
11780    Register LHS2 = MI.getOperand(2).getReg();
11781    if (RHSisZero) {
      // Second compare is predicated on EQ, so CPSR ends up EQ only when
      // both halves were equal.
11782      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
11783          .addReg(LHS1)
11784          .addImm(0)
11786      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
11787        .addReg(LHS2).addImm(0)
11788        .addImm(ARMCC::EQ).addReg(ARM::CPSR);
11789    } else {
11790      Register RHS1 = MI.getOperand(3).getReg();
11791      Register RHS2 = MI.getOperand(4).getReg();
11792      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
11793          .addReg(LHS1)
11794          .addReg(RHS1)
11796      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
11798        .addImm(ARMCC::EQ).addReg(ARM::CPSR);
11799    }
11800
11801    MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
11803    if (MI.getOperand(0).getImm() == ARMCC::NE)
11805
11806    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
11807      .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
11808    if (isThumb2)
11809      BuildMI(BB, dl, TII->get(ARM::t2B))
11810          .addMBB(exitMBB)
11812    else
11813      BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
11814
11815    MI.eraseFromParent(); // The pseudo instruction is gone now.
11816    return BB;
11817  }
11818
  // The setjmp intrinsics are expanded elsewhere; nothing to insert here.
11819  case ARM::Int_eh_sjlj_setjmp:
11820  case ARM::Int_eh_sjlj_setjmp_nofp:
11821  case ARM::tInt_eh_sjlj_setjmp:
11822  case ARM::t2Int_eh_sjlj_setjmp:
11823  case ARM::t2Int_eh_sjlj_setjmp_nofp:
11824    return BB;
11825
11826  case ARM::Int_eh_sjlj_setup_dispatch:
11827    EmitSjLjDispatchBlock(MI, BB);
11828    return BB;
11829
11830  case ARM::ABS:
11831  case ARM::t2ABS: {
11832    // To insert an ABS instruction, we have to insert the
11833    // diamond control-flow pattern.  The incoming instruction knows the
11834    // source vreg to test against 0, the destination vreg to set,
11835    // the condition code register to branch on, the
11836    // true/false values to select between, and a branch opcode to use.
11837    // It transforms
11838    //     V1 = ABS V0
11839    // into
11840    //     V2 = MOVS V0
11841    //     BCC                      (branch to SinkBB if V0 >= 0)
11842    //     RSBBB: V3 = RSBri V2, 0  (compute ABS if V2 < 0)
11843    //     SinkBB: V1 = PHI(V2, V3)
11844    const BasicBlock *LLVM_BB = BB->getBasicBlock();
11846    MachineFunction *Fn = BB->getParent();
11849    Fn->insert(BBI, RSBBB);
11850    Fn->insert(BBI, SinkBB);
11851
11852    Register ABSSrcReg = MI.getOperand(1).getReg();
11853    Register ABSDstReg = MI.getOperand(0).getReg();
11854    bool ABSSrcKIll = MI.getOperand(1).isKill();
11855    bool isThumb2 = Subtarget->isThumb2();
11857    // In Thumb mode S must not be specified if source register is the SP or
11858    // PC and if destination register is the SP, so restrict register class
11859    Register NewRsbDstReg = MRI.createVirtualRegister(
11860        isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
11861
11862    // Transfer the remainder of BB and its successor edges to sinkMBB.
11863    SinkBB->splice(SinkBB->begin(), BB,
11864                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
11865    SinkBB->transferSuccessorsAndUpdatePHIs(BB);
11866
11867    BB->addSuccessor(RSBBB);
11868    BB->addSuccessor(SinkBB);
11869
11870    // fall through to SinkMBB
11871    RSBBB->addSuccessor(SinkBB);
11872
11873    // insert a cmp at the end of BB
11874    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
11876        .addImm(0)
11878
11879    // insert a bcc with opposite CC to ARMCC::MI at the end of BB
11880    BuildMI(BB, dl,
11881            TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
11882        .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
11883
11884    // insert rsbri in RSBBB
11885    // Note: BCC and rsbri will be converted into predicated rsbmi
11886    // by if-conversion pass
11887    BuildMI(*RSBBB, RSBBB->begin(), dl,
11888            TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
11890        .addImm(0)
11892        .add(condCodeOp());
11893
11894    // insert PHI in SinkBB,
11895    // reuse ABSDstReg to not change uses of ABS instruction
11896    BuildMI(*SinkBB, SinkBB->begin(), dl,
11897            TII->get(ARM::PHI), ABSDstReg)
11899        .addReg(ABSSrcReg).addMBB(BB);
11900
11901    // remove ABS instruction
11902    MI.eraseFromParent();
11903
11904    // return last added BB
11905    return SinkBB;
11906  }
11907  case ARM::COPY_STRUCT_BYVAL_I32:
11908    ++NumLoopByVals;
11909    return EmitStructByval(MI, BB);
11910  case ARM::WIN__CHKSTK:
11911    return EmitLowered__chkstk(MI, BB);
11912  case ARM::WIN__DBZCHK:
11913    return EmitLowered__dbzchk(MI, BB);
11914  }
11915}
11916
11917/// Attaches vregs to MEMCPY that it will use as scratch registers
11918/// when it is expanded into LDM/STM. This is done as a post-isel lowering
11919/// instead of as a custom inserter because we need the use list from the SDNode.
11920static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
11921                                    MachineInstr &MI, const SDNode *Node) {
  // Creates one virtual scratch register per unit the MEMCPY pseudo will
  // copy (operand 4 holds the count) and attaches them to MI, so the later
  // LDM/STM expansion has registers to load into. The SDNode's use list
  // tells us whether the pseudo's results (new dst/src) are actually used.
  // NOTE(review): excerpt — the MRI declaration and the MIB.addReg(TmpReg,
  // ...) line inside the loop are elided in this listing.
11922  bool isThumb1 = Subtarget->isThumb1Only();
11923
11924  DebugLoc DL = MI.getDebugLoc();
11925  MachineFunction *MF = MI.getParent()->getParent();
11927  MachineInstrBuilder MIB(*MF, MI);
11928
11929  // If the new dst/src is unused mark it as dead.
11930  if (!Node->hasAnyUseOfValue(0)) {
11931    MI.getOperand(0).setIsDead(true);
11932  }
11933  if (!Node->hasAnyUseOfValue(1)) {
11934    MI.getOperand(1).setIsDead(true);
11935  }
11936
11937  // The MEMCPY both defines and kills the scratch registers.
11938  for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
    // Thumb1 LDM/STM can only address the low registers (tGPR).
11939    Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
11940                                                         : &ARM::GPRRegClass);
11942  }
11943}
11944
11946                                                      SDNode *Node) const {
  // Post-isel hook (signature line elided in this excerpt): attaches scratch
  // registers to MEMCPY, and normalizes the optional cc_out operand of
  // flag-setting add/sub-family instructions.
11947  if (MI.getOpcode() == ARM::MEMCPY) {
11948    attachMEMCPYScratchRegs(Subtarget, MI, Node);
11949    return;
11950  }
11951
11952  const MCInstrDesc *MCID = &MI.getDesc();
11953  // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
11954  // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
11955  // operand is still set to noreg. If needed, set the optional operand's
11956  // register to CPSR, and remove the redundant implicit def.
11957  //
11958  // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
11959
11960  // Rename pseudo opcodes.
11961  unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
11962  unsigned ccOutIdx;
11963  if (NewOpc) {
11964    const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
11965    MCID = &TII->get(NewOpc);
11966
11967    assert(MCID->getNumOperands() ==
11968           MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
11969        && "converted opcode should be the same except for cc_out"
11970           " (and, on Thumb1, pred)");
11971
11972    MI.setDesc(*MCID);
11973
11974    // Add the optional cc_out operand
11975    MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
11976
11977    // On Thumb1, move all input operands to the end, then add the predicate
11978    if (Subtarget->isThumb1Only()) {
      // Rotate the use operands behind the newly added cc_out so the layout
      // matches the Thumb1 encoding: def, cc_out, uses, pred.
11979      for (unsigned c = MCID->getNumOperands() - 4; c--;) {
11980        MI.addOperand(MI.getOperand(1));
11981        MI.RemoveOperand(1);
11982      }
11983
11984      // Restore the ties
11985      for (unsigned i = MI.getNumOperands(); i--;) {
11986        const MachineOperand& op = MI.getOperand(i);
11987        if (op.isReg() && op.isUse()) {
11988          int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
11989          if (DefIdx != -1)
11990            MI.tieOperands(DefIdx, i);
11991        }
11992      }
11993
11995      MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
11996      ccOutIdx = 1;
11997    } else
11998      ccOutIdx = MCID->getNumOperands() - 1;
11999  } else
12000    ccOutIdx = MCID->getNumOperands() - 1;
12001
12002  // Any ARM instruction that sets the 's' bit should specify an optional
12003  // "cc_out" operand in the last operand position.
12004  if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
12005    assert(!NewOpc && "Optional cc_out operand required");
12006    return;
12007  }
12008  // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
12009  // since we already have an optional CPSR def.
12010  bool definesCPSR = false;
12011  bool deadCPSR = false;
12012  for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
12013       ++i) {
12014    const MachineOperand &MO = MI.getOperand(i);
12015    if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
12016      definesCPSR = true;
12017      if (MO.isDead())
12018        deadCPSR = true;
12019      MI.RemoveOperand(i);
12020      break;
12021    }
12022  }
12023  if (!definesCPSR) {
12024    assert(!NewOpc && "Optional cc_out operand required");
12025    return;
12026  }
12027  assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
12028  if (deadCPSR) {
12029    assert(!MI.getOperand(ccOutIdx).getReg() &&
12030           "expect uninitialized optional cc_out operand");
12031    // Thumb1 instructions must have the S bit even if the CPSR is dead.
12032    if (!Subtarget->isThumb1Only())
12033      return;
12034  }
12035
12036  // If this instruction was defined with an optional CPSR def and its dag node
12037  // had a live implicit CPSR def, then activate the optional CPSR def.
12038  MachineOperand &MO = MI.getOperand(ccOutIdx);
12039  MO.setReg(ARM::CPSR);
12040  MO.setIsDef(true);
12041}
12042
12043//===----------------------------------------------------------------------===//
12044// ARM Optimization Hooks
12045//===----------------------------------------------------------------------===//
12046
12047// Helper function that checks if N is a null or all ones constant.
12048static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
12049 return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
12050}
12051
12052// Return true if N is conditionally 0 or all ones.
12053// Detects these expressions where cc is an i1 value:
12054//
12055// (select cc 0, y) [AllOnes=0]
12056// (select cc y, 0) [AllOnes=0]
12057// (zext cc) [AllOnes=0]
12058// (sext cc) [AllOnes=0/1]
12059// (select cc -1, y) [AllOnes=1]
12060// (select cc y, -1) [AllOnes=1]
12061//
12062// Invert is set when N is the null/all ones constant when CC is false.
12063// OtherOp is set to the alternative value of N.
12064static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
12065 SDValue &CC, bool &Invert,
12066 SDValue &OtherOp,
12067 SelectionDAG &DAG) {
12068 switch (N->getOpcode()) {
12069 default: return false;
12070 case ISD::SELECT: {
12071 CC = N->getOperand(0);
12072 SDValue N1 = N->getOperand(1);
12073 SDValue N2 = N->getOperand(2);
12074 if (isZeroOrAllOnes(N1, AllOnes)) {
12075 Invert = false;
12076 OtherOp = N2;
12077 return true;
12078 }
12079 if (isZeroOrAllOnes(N2, AllOnes)) {
12080 Invert = true;
12081 OtherOp = N1;
12082 return true;
12083 }
12084 return false;
12085 }
12086 case ISD::ZERO_EXTEND:
12087 // (zext cc) can never be the all ones value.
12088 if (AllOnes)
12089 return false;
12091 case ISD::SIGN_EXTEND: {
12092 SDLoc dl(N);
12093 EVT VT = N->getValueType(0);
12094 CC = N->getOperand(0);
12095 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
12096 return false;
12097 Invert = !AllOnes;
12098 if (AllOnes)
12099 // When looking for an AllOnes constant, N is an sext, and the 'other'
12100 // value is 0.
12101 OtherOp = DAG.getConstant(0, dl, VT);
12102 else if (N->getOpcode() == ISD::ZERO_EXTEND)
12103 // When looking for a 0 constant, N can be zext or sext.
12104 OtherOp = DAG.getConstant(1, dl, VT);
12105 else
12106 OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
12107 VT);
12108 return true;
12109 }
12110 }
12111}
12112
12113// Combine a constant select operand into its use:
12114//
12115// (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
12116// (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
12117// (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1]
12118// (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
12119// (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
12120//
12121// The transform is rejected if the select doesn't have a constant operand that
12122// is null, or all ones when AllOnes is set.
12123//
12124// Also recognize sext/zext from i1:
12125//
12126// (add (zext cc), x) -> (select cc (add x, 1), x)
12127// (add (sext cc), x) -> (select cc (add x, -1), x)
12128//
12129// These transformations eventually create predicated instructions.
12130//
12131// @param N The node to transform.
12132// @param Slct The N operand that is a select.
12133// @param OtherOp The other N operand (x above).
12134// @param DCI Context.
12135// @param AllOnes Require the select constant to be all ones instead of null.
12136// @returns The new node, or SDValue() on failure.
12137static
12140 bool AllOnes = false) {
12141 SelectionDAG &DAG = DCI.DAG;
12142 EVT VT = N->getValueType(0);
12144 SDValue CCOp;
12145 bool SwapSelectOps;
12146 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
12147 NonConstantVal, DAG))
12148 return SDValue();
12149
12150 // Slct is now know to be the desired identity constant when CC is true.
12151 SDValue TrueVal = OtherOp;
12152 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
12153 OtherOp, NonConstantVal);
12154 // Unless SwapSelectOps says CC should be false.
12155 if (SwapSelectOps)
12156 std::swap(TrueVal, FalseVal);
12157
12158 return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
12159 CCOp, TrueVal, FalseVal);
12160}
12161
12162// Attempt combineSelectAndUse on each operand of a commutative operator N.
12163static
12166 SDValue N0 = N->getOperand(0);
12167 SDValue N1 = N->getOperand(1);
12168 if (N0.getNode()->hasOneUse())
12169 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
12170 return Result;
12171 if (N1.getNode()->hasOneUse())
12172 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
12173 return Result;
12174 return SDValue();
12175}
12176
12178 // VUZP shuffle node.
12179 if (N->getOpcode() == ARMISD::VUZP)
12180 return true;
12181
12182 // "VUZP" on i32 is an alias for VTRN.
12183 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
12184 return true;
12185
12186 return false;
12187}
12188
12191 const ARMSubtarget *Subtarget) {
12192 // Look for ADD(VUZP.0, VUZP.1).
12193 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
12194 N0 == N1)
12195 return SDValue();
12196
12197 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
12198 if (!N->getValueType(0).is64BitVector())
12199 return SDValue();
12200
12201 // Generate vpadd.
12202 SelectionDAG &DAG = DCI.DAG;
12203 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12204 SDLoc dl(N);
12205 SDNode *Unzip = N0.getNode();
12206 EVT VT = N->getValueType(0);
12207
12209 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
12210 TLI.getPointerTy(DAG.getDataLayout())));
12211 Ops.push_back(Unzip->getOperand(0));
12212 Ops.push_back(Unzip->getOperand(1));
12213
12214 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12215}
12216
12219 const ARMSubtarget *Subtarget) {
12220 // Check for two extended operands.
12221 if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
12222 N1.getOpcode() == ISD::SIGN_EXTEND) &&
12223 !(N0.getOpcode() == ISD::ZERO_EXTEND &&
12224 N1.getOpcode() == ISD::ZERO_EXTEND))
12225 return SDValue();
12226
12227 SDValue N00 = N0.getOperand(0);
12228 SDValue N10 = N1.getOperand(0);
12229
12230 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
12231 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
12232 N00 == N10)
12233 return SDValue();
12234
12235 // We only recognize Q register paddl here; this can't be reached until
12236 // after type legalization.
12237 if (!N00.getValueType().is64BitVector() ||
12239 return SDValue();
12240
12241 // Generate vpaddl.
12242 SelectionDAG &DAG = DCI.DAG;
12243 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12244 SDLoc dl(N);
12245 EVT VT = N->getValueType(0);
12246
12248 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
12249 unsigned Opcode;
12250 if (N0.getOpcode() == ISD::SIGN_EXTEND)
12251 Opcode = Intrinsic::arm_neon_vpaddls;
12252 else
12253 Opcode = Intrinsic::arm_neon_vpaddlu;
12254 Ops.push_back(DAG.getConstant(Opcode, dl,
12255 TLI.getPointerTy(DAG.getDataLayout())));
12256 EVT ElemTy = N00.getValueType().getVectorElementType();
12257 unsigned NumElts = VT.getVectorNumElements();
12258 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
12260 N00.getOperand(0), N00.getOperand(1));
12261 Ops.push_back(Concat);
12262
12263 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12264}
12265
12266// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
12267// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
12268// much easier to match.
12269static SDValue
12272 const ARMSubtarget *Subtarget) {
12273 // Only perform optimization if after legalize, and if NEON is available. We
12274 // also expected both operands to be BUILD_VECTORs.
12275 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
12276 || N0.getOpcode() != ISD::BUILD_VECTOR
12277 || N1.getOpcode() != ISD::BUILD_VECTOR)
12278 return SDValue();
12279
12280 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
12281 EVT VT = N->getValueType(0);
12282 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
12283 return SDValue();
12284
12285 // Check that the vector operands are of the right form.
12286 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
12287 // operands, where N is the size of the formed vector.
12288 // Each EXTRACT_VECTOR should have the same input vector and odd or even
12289 // index such that we have a pair wise add pattern.
12290
12291 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
12293 return SDValue();
12294 SDValue Vec = N0->getOperand(0)->getOperand(0);
12295 SDNode *V = Vec.getNode();
12296 unsigned nextIndex = 0;
12297
12298 // For each operands to the ADD which are BUILD_VECTORs,
12299 // check to see if each of their operands are an EXTRACT_VECTOR with
12300 // the same vector and appropriate index.
12301 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
12303 && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
12304
12305 SDValue ExtVec0 = N0->getOperand(i);
12306 SDValue ExtVec1 = N1->getOperand(i);
12307
12308 // First operand is the vector, verify its the same.
12309 if (V != ExtVec0->getOperand(0).getNode() ||
12310 V != ExtVec1->getOperand(0).getNode())
12311 return SDValue();
12312
12313 // Second is the constant, verify its correct.
12316
12317 // For the constant, we want to see all the even or all the odd.
12318 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
12319 || C1->getZExtValue() != nextIndex+1)
12320 return SDValue();
12321
12322 // Increment index.
12323 nextIndex+=2;
12324 } else
12325 return SDValue();
12326 }
12327
12328 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
12329 // we're using the entire input vector, otherwise there's a size/legality
12330 // mismatch somewhere.
12333 return SDValue();
12334
12335 // Create VPADDL node.
12336 SelectionDAG &DAG = DCI.DAG;
12337 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12338
12339 SDLoc dl(N);
12340
12341 // Build operand list.
12343 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
12344 TLI.getPointerTy(DAG.getDataLayout())));
12345
12346 // Input is the vector.
12347 Ops.push_back(Vec);
12348
12349 // Get widened type and narrowed type.
12350 MVT widenType;
12351 unsigned numElem = VT.getVectorNumElements();
12352
12354 switch (inputLaneType.getSimpleVT().SimpleTy) {
12358 default:
12359 llvm_unreachable("Invalid vector element type for padd optimization.");
12360 }
12361
12363 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
12364 return DAG.getNode(ExtOp, dl, VT, tmp);
12365}
12366
12368 if (V->getOpcode() == ISD::UMUL_LOHI ||
12369 V->getOpcode() == ISD::SMUL_LOHI)
12370 return V;
12371 return SDValue();
12372}
12373
12376 const ARMSubtarget *Subtarget) {
12377 if (!Subtarget->hasBaseDSP())
12378 return SDValue();
12379
12380 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
12381 // accumulates the product into a 64-bit value. The 16-bit values will
12382 // be sign extended somehow or SRA'd into 32-bit values
12383 // (addc (adde (mul 16bit, 16bit), lo), hi)
12384 SDValue Mul = AddcNode->getOperand(0);
12385 SDValue Lo = AddcNode->getOperand(1);
12386 if (Mul.getOpcode() != ISD::MUL) {
12387 Lo = AddcNode->getOperand(0);
12388 Mul = AddcNode->getOperand(1);
12389 if (Mul.getOpcode() != ISD::MUL)
12390 return SDValue();
12391 }
12392
12393 SDValue SRA = AddeNode->getOperand(0);
12394 SDValue Hi = AddeNode->getOperand(1);
12395 if (SRA.getOpcode() != ISD::SRA) {
12396 SRA = AddeNode->getOperand(1);
12397 Hi = AddeNode->getOperand(0);
12398 if (SRA.getOpcode() != ISD::SRA)
12399 return SDValue();
12400 }
12401 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
12402 if (Const->getZExtValue() != 31)
12403 return SDValue();
12404 } else
12405 return SDValue();
12406
12407 if (SRA.getOperand(0) != Mul)
12408 return SDValue();
12409
12410 SelectionDAG &DAG = DCI.DAG;
12411 SDLoc dl(AddcNode);
12412 unsigned Opcode = 0;
12413 SDValue Op0;
12414 SDValue Op1;
12415
12416 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
12417 Opcode = ARMISD::SMLALBB;
12418 Op0 = Mul.getOperand(0);
12419 Op1 = Mul.getOperand(1);
12420 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
12421 Opcode = ARMISD::SMLALBT;
12422 Op0 = Mul.getOperand(0);
12423 Op1 = Mul.getOperand(1).getOperand(0);
12424 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
12425 Opcode = ARMISD::SMLALTB;
12426 Op0 = Mul.getOperand(0).getOperand(0);
12427 Op1 = Mul.getOperand(1);
12428 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
12429 Opcode = ARMISD::SMLALTT;
12430 Op0 = Mul->getOperand(0).getOperand(0);
12431 Op1 = Mul->getOperand(1).getOperand(0);
12432 }
12433
12434 if (!Op0 || !Op1)
12435 return SDValue();
12436
12437 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
12438 Op0, Op1, Lo, Hi);
12439 // Replace the ADDs' nodes uses by the MLA node's values.
12440 SDValue HiMLALResult(SMLAL.getNode(), 1);
12441 SDValue LoMLALResult(SMLAL.getNode(), 0);
12442
12445
12446 // Return original node to notify the driver to stop replacing.
12448 return resNode;
12449}
12450
12453 const ARMSubtarget *Subtarget) {
12454 // Look for multiply add opportunities.
12455 // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
12456 // each add nodes consumes a value from ISD::UMUL_LOHI and there is
12457 // a glue link from the first add to the second add.
12458 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
12459 // a S/UMLAL instruction.
12460 // UMUL_LOHI
12461 // / :lo \ :hi
12462 // V \ [no multiline comment]
12463 // loAdd -> ADDC |
12464 // \ :carry /
12465 // V V
12466 // ADDE <- hiAdd
12467 //
12468 // In the special case where only the higher part of a signed result is used
12469 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
12470 // a constant with the exact value of 0x80000000, we recognize we are dealing
12471 // with a "rounded multiply and add" (or subtract) and transform it into
12472 // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
12473
12474 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
12475 AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
12476 "Expect an ADDE or SUBE");
12477
12478 assert(AddeSubeNode->getNumOperands() == 3 &&
12479 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
12480 "ADDE node has the wrong inputs");
12481
12482 // Check that we are chained to the right ADDC or SUBC node.
12483 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
12484 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12485 AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
12486 (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
12487 AddcSubcNode->getOpcode() != ARMISD::SUBC))
12488 return SDValue();
12489
12490 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
12491 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
12492
12493 // Check if the two operands are from the same mul_lohi node.
12494 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
12495 return SDValue();
12496
12497 assert(AddcSubcNode->getNumValues() == 2 &&
12498 AddcSubcNode->getValueType(0) == MVT::i32 &&
12499 "Expect ADDC with two result values. First: i32");
12500
12501 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
12502 // maybe a SMLAL which multiplies two 16-bit values.
12503 if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12504 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
12505 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
12506 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
12507 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
12509
12510 // Check for the triangle shape.
12511 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
12512 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
12513
12514 // Make sure that the ADDE/SUBE operands are not coming from the same node.
12515 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
12516 return SDValue();
12517
12518 // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
12519 bool IsLeftOperandMUL = false;
12521 if (MULOp == SDValue())
12523 else
12524 IsLeftOperandMUL = true;
12525 if (MULOp == SDValue())
12526 return SDValue();
12527
12528 // Figure out the right opcode.
12529 unsigned Opc = MULOp->getOpcode();
12530 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
12531
12532 // Figure out the high and low input values to the MLAL node.
12533 SDValue *HiAddSub = nullptr;
12534 SDValue *LoMul = nullptr;
12535 SDValue *LowAddSub = nullptr;
12536
12537 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
12538 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
12539 return SDValue();
12540
12541 if (IsLeftOperandMUL)
12543 else
12545
12546 // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
12547 // whose low result is fed to the ADDC/SUBC we are checking.
12548
12549 if (AddcSubcOp0 == MULOp.getValue(0)) {
12550 LoMul = &AddcSubcOp0;
12552 }
12553 if (AddcSubcOp1 == MULOp.getValue(0)) {
12554 LoMul = &AddcSubcOp1;
12556 }
12557
12558 if (!LoMul)
12559 return SDValue();
12560
12561 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
12562 // the replacement below will create a cycle.
12563 if (AddcSubcNode == HiAddSub->getNode() ||
12564 AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
12565 return SDValue();
12566
12567 // Create the merged node.
12568 SelectionDAG &DAG = DCI.DAG;
12569
12570 // Start building operand list.
12572 Ops.push_back(LoMul->getOperand(0));
12573 Ops.push_back(LoMul->getOperand(1));
12574
12575 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
12576 // the case, we must be doing signed multiplication and only use the higher
12577 // part of the result of the MLAL, furthermore the LowAddSub must be a constant
12578 // addition or subtraction with the value of 0x800000.
12579 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
12580 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
12581 LowAddSub->getNode()->getOpcode() == ISD::Constant &&
12582 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
12583 0x80000000) {
12584 Ops.push_back(*HiAddSub);
12585 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
12587 } else {
12589 }
12590 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
12592
12593 return SDValue(AddeSubeNode, 0);
12594 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
12595 // SMMLS is generated during instruction selection and the rest of this
12596 // function can not handle the case where AddcSubcNode is a SUBC.
12597 return SDValue();
12598
12599 // Finish building the operand list for {U/S}MLAL
12600 Ops.push_back(*LowAddSub);
12601 Ops.push_back(*HiAddSub);
12602
12604 DAG.getVTList(MVT::i32, MVT::i32), Ops);
12605
12606 // Replace the ADDs' nodes uses by the MLA node's values.
12607 SDValue HiMLALResult(MLALNode.getNode(), 1);
12609
12610 SDValue LoMLALResult(MLALNode.getNode(), 0);
12612
12613 // Return original node to notify the driver to stop replacing.
12614 return SDValue(AddeSubeNode, 0);
12615}
12616
12619 const ARMSubtarget *Subtarget) {
12620 // UMAAL is similar to UMLAL except that it adds two unsigned values.
12621 // While trying to combine for the other MLAL nodes, first search for the
12622 // chance to use UMAAL. Check if Addc uses a node which has already
12623 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
12624 // as the addend, and it's handled in PerformUMLALCombine.
12625
12626 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
12627 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
12628
12629 // Check that we have a glued ADDC node.
12630 SDNode* AddcNode = AddeNode->getOperand(2).getNode();
12631 if (AddcNode->getOpcode() != ARMISD::ADDC)
12632 return SDValue();
12633
12634 // Find the converted UMAAL or quit if it doesn't exist.
12635 SDNode *UmlalNode = nullptr;
12636 SDValue AddHi;
12637 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
12638 UmlalNode = AddcNode->getOperand(0).getNode();
12639 AddHi = AddcNode->getOperand(1);
12640 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
12641 UmlalNode = AddcNode->getOperand(1).getNode();
12642 AddHi = AddcNode->getOperand(0);
12643 } else {
12644 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
12645 }
12646
12647 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
12648 // the ADDC as well as Zero.
12649 if (!isNullConstant(UmlalNode->getOperand(3)))
12650 return SDValue();
12651
12652 if ((isNullConstant(AddeNode->getOperand(0)) &&
12653 AddeNode->getOperand(1).getNode() == UmlalNode) ||
12654 (AddeNode->getOperand(0).getNode() == UmlalNode &&
12655 isNullConstant(AddeNode->getOperand(1)))) {
12656 SelectionDAG &DAG = DCI.DAG;
12657 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
12658 UmlalNode->getOperand(2), AddHi };
12660 DAG.getVTList(MVT::i32, MVT::i32), Ops);
12661
12662 // Replace the ADDs' nodes uses by the UMAAL node's values.
12663 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
12664 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
12665
12666 // Return original node to notify the driver to stop replacing.
12667 return SDValue(AddeNode, 0);
12668 }
12669 return SDValue();
12670}
12671
12673 const ARMSubtarget *Subtarget) {
12674 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
12675 return SDValue();
12676
12677 // Check that we have a pair of ADDC and ADDE as operands.
12678 // Both addends of the ADDE must be zero.
12679 SDNode* AddcNode = N->getOperand(2).getNode();
12680 SDNode* AddeNode = N->getOperand(3).getNode();
12681 if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
12682 (AddeNode->getOpcode() == ARMISD::ADDE) &&
12683 isNullConstant(AddeNode->getOperand(0)) &&
12684 isNullConstant(AddeNode->getOperand(1)) &&
12685 (AddeNode->getOperand(2).getNode() == AddcNode))
12686 return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
12688 {N->getOperand(0), N->getOperand(1),
12689 AddcNode->getOperand(0), AddcNode->getOperand(1)});
12690 else
12691 return SDValue();
12692}
12693
12696 const ARMSubtarget *Subtarget) {
12697 SelectionDAG &DAG(DCI.DAG);
12698
12699 if (N->getOpcode() == ARMISD::SUBC) {
12700 // (SUBC (ADDE 0, 0, C), 1) -> C
12701 SDValue LHS = N->getOperand(0);
12702 SDValue RHS = N->getOperand(1);
12703 if (LHS->getOpcode() == ARMISD::ADDE &&
12704 isNullConstant(LHS->getOperand(0)) &&
12705 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
12706 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
12707 }
12708 }
12709
12710 if (Subtarget->isThumb1Only()) {
12711 SDValue RHS = N->getOperand(1);
12713 int32_t imm = C->getSExtValue();
12714 if (imm < 0 && imm > std::numeric_limits<int>::min()) {
12715 SDLoc DL(N);
12716 RHS = DAG.getConstant(-imm, DL, MVT::i32);
12717 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
12718 : ARMISD::ADDC;
12719 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
12720 }
12721 }
12722 }
12723
12724 return SDValue();
12725}
12726
12729 const ARMSubtarget *Subtarget) {
12730 if (Subtarget->isThumb1Only()) {
12731 SelectionDAG &DAG = DCI.DAG;
12732 SDValue RHS = N->getOperand(1);
12734 int64_t imm = C->getSExtValue();
12735 if (imm < 0) {
12736 SDLoc DL(N);
12737
12738 // The with-carry-in form matches bitwise not instead of the negation.
12739 // Effectively, the inverse interpretation of the carry flag already
12740 // accounts for part of the negation.
12741 RHS = DAG.getConstant(~imm, DL, MVT::i32);
12742
12743 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
12744 : ARMISD::ADDE;
12745 return DAG.getNode(Opcode, DL, N->getVTList(),
12746 N->getOperand(0), RHS, N->getOperand(2));
12747 }
12748 }
12749 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
12750 return AddCombineTo64bitMLAL(N, DCI, Subtarget);
12751 }
12752 return SDValue();
12753}
12754
12757 const ARMSubtarget *Subtarget) {
12758 if (!Subtarget->hasMVEIntegerOps())
12759 return SDValue();
12760
12761 SDLoc dl(N);
12762 SDValue SetCC;
12763 SDValue LHS;
12764 SDValue RHS;
12765 ISD::CondCode CC;
12766 SDValue TrueVal;
12767 SDValue FalseVal;
12768
12769 if (N->getOpcode() == ISD::SELECT &&
12770 N->getOperand(0)->getOpcode() == ISD::SETCC) {
12771 SetCC = N->getOperand(0);
12772 LHS = SetCC->getOperand(0);
12773 RHS = SetCC->getOperand(1);
12774 CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
12775 TrueVal = N->getOperand(1);
12776 FalseVal = N->getOperand(2);
12777 } else if (N->getOpcode() == ISD::SELECT_CC) {
12778 LHS = N->getOperand(0);
12779 RHS = N->getOperand(1);
12780 CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
12781 TrueVal = N->getOperand(2);
12782 FalseVal = N->getOperand(3);
12783 } else {
12784 return SDValue();
12785 }
12786
12787 unsigned int Opcode = 0;
12788 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
12789 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
12790 (CC == ISD::SETULT || CC == ISD::SETUGT)) {
12791 Opcode = ARMISD::VMINVu;
12792 if (CC == ISD::SETUGT)
12793 std::swap(TrueVal, FalseVal);
12794 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
12795 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
12796 (CC == ISD::SETLT || CC == ISD::SETGT)) {
12797 Opcode = ARMISD::VMINVs;
12798 if (CC == ISD::SETGT)
12799 std::swap(TrueVal, FalseVal);
12800 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
12801 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
12802 (CC == ISD::SETUGT || CC == ISD::SETULT)) {
12803 Opcode = ARMISD::VMAXVu;
12804 if (CC == ISD::SETULT)
12805 std::swap(TrueVal, FalseVal);
12806 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
12807 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
12808 (CC == ISD::SETGT || CC == ISD::SETLT)) {
12809 Opcode = ARMISD::VMAXVs;
12810 if (CC == ISD::SETLT)
12811 std::swap(TrueVal, FalseVal);
12812 } else
12813 return SDValue();
12814
12815 // Normalise to the right hand side being the vector reduction
12816 switch (TrueVal->getOpcode()) {
12821 std::swap(LHS, RHS);
12822 std::swap(TrueVal, FalseVal);
12823 break;
12824 }
12825
12826 EVT VectorType = FalseVal->getOperand(0).getValueType();
12827
12830 return SDValue();
12831
12832 EVT VectorScalarType = VectorType.getVectorElementType();
12833
12834 // The values being selected must also be the ones being compared
12835 if (TrueVal != LHS || FalseVal != RHS)
12836 return SDValue();
12837
12838 EVT LeftType = LHS->getValueType(0);
12839 EVT RightType = RHS->getValueType(0);
12840
12841 // The types must match the reduced type too
12843 return SDValue();
12844
12845 // Legalise the scalar to an i32
12847 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
12848
12849 // Generate the reduction as an i32 for legalisation purposes
12850 auto Reduction =
12851 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
12852
12853 // The result isn't actually an i32 so truncate it back to its original type
12855 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
12856
12857 return Reduction;
12858}
12859
12860// A special combine for the vqdmulh family of instructions. This is one of the
12861// potential set of patterns that could patch this instruction. The base pattern
12862// you would expect to be min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
12863// This matches the different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
12864// which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15))) as
12865// the max is unnecessary.
12867 EVT VT = N->getValueType(0);
12868 SDValue Shft;
12870
12871 if (N->getOpcode() == ISD::SMIN) {
12872 Shft = N->getOperand(0);
12873 Clamp = isConstOrConstSplat(N->getOperand(1));
12874 } else if (N->getOpcode() == ISD::VSELECT) {
12875 // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
12876 SDValue Cmp = N->getOperand(0);
12877 if (Cmp.getOpcode() != ISD::SETCC ||
12878 cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
12879 Cmp.getOperand(0) != N->getOperand(1) ||
12880 Cmp.getOperand(1) != N->getOperand(2))
12881 return SDValue();
12882 Shft = N->getOperand(1);
12883 Clamp = isConstOrConstSplat(N->getOperand(2));
12884 } else
12885 return SDValue();
12886
12887 if (!Clamp)
12888 return SDValue();
12889
12891 int ShftAmt = 0;
12892 switch (Clamp->getSExtValue()) {
12893 case (1 << 7) - 1:
12895 ShftAmt = 7;
12896 break;
12897 case (1 << 15) - 1:
12899 ShftAmt = 15;
12900 break;
12901 case (1ULL << 31) - 1:
12903 ShftAmt = 31;
12904 break;
12905 default:
12906 return SDValue();
12907 }
12908
12909 if (Shft.getOpcode() != ISD::SRA)
12910 return SDValue();
12911 ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1));
12912 if (!N1 || N1->getSExtValue() != ShftAmt)
12913 return SDValue();
12914
12915 SDValue Mul = Shft.getOperand(0);
12916 if (Mul.getOpcode() != ISD::MUL)
12917 return SDValue();
12918
12919 SDValue Ext0 = Mul.getOperand(0);
12920 SDValue Ext1 = Mul.getOperand(1);
12921 if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
12922 Ext1.getOpcode() != ISD::SIGN_EXTEND)
12923 return SDValue();
12924 EVT VecVT = Ext0.getOperand(0).getValueType();
12925 if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
12926 return SDValue();
12927 if (Ext1.getOperand(0).getValueType() != VecVT ||
12928 VecVT.getScalarType() != ScalarType ||
12929 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
12930 return SDValue();
12931
12932 SDLoc DL(Mul);
12933 unsigned LegalLanes = 128 / (ShftAmt + 1);
12935 // For types smaller than legal vectors extend to be legal and only use needed
12936 // lanes.
12937 if (VecVT.getSizeInBits() < 128) {
12938 EVT ExtVecVT =
12940 VecVT.getVectorNumElements());
12941 SDValue Inp0 =
12942 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));
12943 SDValue Inp1 =
12944 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));
12948 SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);
12949 Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);
12950 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);
12951 }
12952
12953 // For larger types, split into legal sized chunks.
12954 assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
12955 unsigned NumParts = VecVT.getSizeInBits() / 128;
12957 for (unsigned I = 0; I < NumParts; ++I) {
12958 SDValue Inp0 =
12959 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),
12961 SDValue Inp1 =
12962 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),
12965 Parts.push_back(VQDMULH);
12966 }
12967 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,
12968 DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
12969}
12970
                                           const ARMSubtarget *Subtarget) {
  // Target-specific combines for ISD::VSELECT; everything here is MVE-only.
  if (!Subtarget->hasMVEIntegerOps())
    return SDValue();

  // First try folding the vselect into a VQDMULH node.
  if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
    return V;

  // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
  //
  // We need to re-implement this optimization here as the implementation in the
  // Target-Independent DAGCombiner does not handle the kind of constant we make
  // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
  // good reason, allowing truncation there would break other targets).
  //
  // Currently, this is only done for MVE, as it's the only target that benefits
  // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
  if (N->getOperand(0).getOpcode() != ISD::XOR)
    return SDValue();
  SDValue XOR = N->getOperand(0);

  // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
  // It is important to check with truncation allowed as the BUILD_VECTORs we
  // generate in those situations will truncate their operands.
  ConstantSDNode *Const =
      isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
                          /*AllowTruncation*/ true);
  if (!Const || !Const->isOne())
    return SDValue();

  // Rewrite into vselect(cond, rhs, lhs).
  SDValue Cond = XOR->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  EVT Type = N->getValueType(0);
  return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
}
13009
                                      const ARMSubtarget *Subtarget) {
  // Expand an illegal ISD::ABS into the generic shift/xor/sub sequence.
  SDValue res;
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // If ABS is legal for this type, leave it for instruction selection.
  if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0)))
    return SDValue();

  // Let the generic expander build the replacement; bail if it declines.
  if (!TLI.expandABS(N, res, DAG))
    return SDValue();

  return res;
}
13025
/// PerformADDECombine - Target-specific dag combine transform from
/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
                                      const ARMSubtarget *Subtarget) {
  // Only ARM and Thumb2 support UMLAL/SMLAL.
  if (Subtarget->isThumb1Only())
    return PerformAddeSubeCombine(N, DCI, Subtarget);

  // Only perform the checks after legalize when the pattern is available.
  if (DCI.isBeforeLegalize()) return SDValue();

  return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
}
13041
/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1. This is a helper for PerformADDCombine that is
/// called with the default operands, and if that fails, with commuted
/// operands.
                                          const ARMSubtarget *Subtarget){
  // Attempt to create vpadd for this add.
  if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
    return Result;

  // Attempt to create vpaddl for this add.
  if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
    return Result;
  if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
                                                      Subtarget))
    return Result;

  // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
  if (N0.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
      return Result;
  return SDValue();
}
13066
                                     const ARMSubtarget *Subtarget) {
  // Push an i64 add through a 64-bit MVE vector reduction (VADDLV family).
  if (!Subtarget->hasMVEIntegerOps() || N->getValueType(0) != MVT::i64)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // We are looking for a i64 add of a VADDLVx. Due to these being i64's, this
  // will look like:
  //   t1: i32,i32 = ARMISD::VADDLVs x
  //   t2: i64 = build_pair t1, t1:1
  //   t3: i64 = add t2, y
  // Otherwise we try to push the add up above VADDLVAx, to potentially allow
  // the add to be simplified separately.
  // We also need to check for sext / zext and commutative adds.
  auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
                           SDValue NB) {
    if (NB->getOpcode() != ISD::BUILD_PAIR)
      return SDValue();
    SDValue VecRed = NB->getOperand(0);
    // The BUILD_PAIR must pair up results 0 and 1 of one reduction node of
    // either the plain (Opcode) or accumulating (OpcodeA) form.
    if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
        VecRed.getResNo() != 0 ||
        NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
      return SDValue();

    SDLoc dl(N);
    if (VecRed->getOpcode() == OpcodeA) {
      // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
      SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
                                VecRed.getOperand(0), VecRed.getOperand(1));
      NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
    }

    // Split the 64-bit accumulator NA into the two i32 halves that the
    // accumulating reduction node expects as its first operands.
    // NOTE(review): the declaration of `Ops` (a SmallVector of SDValue) is
    // elided from this view of the file.
    Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
                              DAG.getConstant(0, dl, MVT::i32)));
    Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
                              DAG.getConstant(1, dl, MVT::i32)));
    // Copy the remaining operands, skipping the original accumulator pair
    // when rewriting an already-accumulating reduction.
    unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
    for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
      Ops.push_back(VecRed->getOperand(I));
    SDValue Red =
        DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
                       SDValue(Red.getNode(), 1));
  };

  // NOTE(review): each `return M;` below is guarded by an elided
  // `if (SDValue M = MakeVecReduce(<reduction opcode pair>, <N0/N1 order>))`
  // line that was lost in extraction; the calls cover the signed/unsigned
  // VADDLV variants with both operand orders.
    return M;
    return M;
    return M;
    return M;
    return M;
    return M;
    return M;
    return M;
    return M;
    return M;
    return M;
    return M;
    return M;
    return M;
    return M;
    return M;
  return SDValue();
}
13149
bool
                                                     CombineLevel Level) const {
  // Decide whether the generic combiner may commute a binop with a shift.
  if (Level == BeforeLegalizeTypes)
    return true;

  if (N->getOpcode() != ISD::SHL)
    return true;

  if (Subtarget->isThumb1Only()) {
    // Avoid making expensive immediates by commuting shifts. (This logic
    // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
    // for free.)
    if (N->getOpcode() != ISD::SHL)
      return true;
    SDValue N1 = N->getOperand(0);
    if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
        N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
      return true;
    if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
      // Small constants (and small negative ADD constants) stay cheap after
      // the commute, so forbid it to keep the immediate small.
      if (Const->getAPIntValue().ult(256))
        return false;
      if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
          Const->getAPIntValue().sgt(-256))
        return false;
    }
    return true;
  }

  // Turn off commute-with-shift transform after legalization, so it doesn't
  // conflict with PerformSHLSimplify. (We could try to detect when
  // PerformSHLSimplify would trigger more precisely, but it isn't
  // really necessary.)
  return false;
}
13185
    const SDNode *N, CombineLevel Level) const {
  // Only Thumb1 restricts folding a shift pair into a mask, and only after
  // type legalization (CombineANDShift handles the Thumb1 patterns instead).
  if (!Subtarget->isThumb1Only())
    return true;

  if (Level == BeforeLegalizeTypes)
    return true;

  return false;
}
13196
  // NOTE(review): the signature line of this ARMTargetLowering predicate is
  // elided from this view; the body returns true/false based on subtarget
  // features and the queried EVT.
  if (!Subtarget->hasNEON()) {
    // Without NEON: Thumb1 only for scalar types up to 32 bits; otherwise yes.
    if (Subtarget->isThumb1Only())
      return VT.getScalarSizeInBits() <= 32;
    return true;
  }
  // With NEON, only scalar integer types qualify.
  return VT.isScalarInteger();
}
13205
                                   const ARMSubtarget *ST) {
  // Allow the generic combiner to identify potential bswaps.
  if (DCI.isBeforeLegalize())
    return SDValue();

  // DAG combiner will fold:
  // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
  // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2
  // Other code patterns that can also be modified have the following form:
  // b + ((a << 1) | 510)
  // b + ((a << 1) & 510)
  // b + ((a << 1) ^ 510)
  // b + ((a << 1) + 510)

  // Many instructions can perform the shift for free, but it requires both
  // the operands to be registers. If c1 << c2 is too large, a mov immediate
  // instruction will be needed. So, unfold back to the original pattern if:
  // - if c1 and c2 are small enough that they don't require mov imms.
  // - the user(s) of the node can perform a shl

  // No shifted operands for 16-bit instructions.
  if (ST->isThumb() && ST->isThumb1Only())
    return SDValue();

  // Check that all the users could perform the shl themselves.
  for (auto U : N->uses()) {
    switch(U->getOpcode()) {
    default:
      return SDValue();
    case ISD::SUB:
    case ISD::ADD:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
    case ISD::SETCC:
    case ARMISD::CMP:
      // Check that the user isn't already using a constant because there
      // aren't any instructions that support an immediate operand and a
      // shifted operand.
      if (isa<ConstantSDNode>(U->getOperand(0)) ||
          isa<ConstantSDNode>(U->getOperand(1)))
        return SDValue();

      // Check that it's not already using a shift.
      if (U->getOperand(0).getOpcode() == ISD::SHL ||
          U->getOperand(1).getOpcode() == ISD::SHL)
        return SDValue();
      break;
    }
  }

  // Only binops whose constant can be re-folded after the shift apply.
  if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
      N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
    return SDValue();

  if (N->getOperand(0).getOpcode() != ISD::SHL)
    return SDValue();

  SDValue SHL = N->getOperand(0);

  auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
  auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
  if (!C1ShlC2 || !C2)
    return SDValue();

  APInt C2Int = C2->getAPIntValue();
  APInt C1Int = C1ShlC2->getAPIntValue();

  // Check that performing a lshr will not lose any information.
  APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(),
                                     C2Int.getBitWidth() - C2->getZExtValue());
  if ((C1Int & Mask) != C1Int)
    return SDValue();

  // Shift the first constant.
  C1Int.lshrInPlace(C2Int);

  // The immediates are encoded as an 8-bit value that can be rotated.
  auto LargeImm = [](const APInt &Imm) {
    unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros();
    return Imm.getBitWidth() - Zeros > 8;
  };

  if (LargeImm(C1Int) || LargeImm(C2Int))
    return SDValue();

  // Rebuild as binop(x, c1 >> c2) shifted left by c2.
  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue X = SHL.getOperand(0);
  SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
                              DAG.getConstant(C1Int, dl, MVT::i32));
  // Shift left to compensate for the lshr of C1Int.
  SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));

  LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
             SHL.dump(); N->dump());
  LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
  return Res;
}
13307
13308
/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
                                 const ARMSubtarget *Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Only works one way, because it needs an immediate operand.
  if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
    return Result;

  // Fold i64 adds into MVE 64-bit vector reductions when possible.
  if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))
    return Result;

  // First try with the default operand order.
  if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
    return Result;

  // If that didn't work, try again with the operands commuted.
  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
}
13331
// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
// providing -X is as cheap as X (currently, just a constant).
  if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
    return SDValue();
  SDValue CSINC = N->getOperand(1);
  if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
    return SDValue();

  // -X must be free to materialize, so require X to be a constant.
  ConstantSDNode *X = dyn_cast<ConstantSDNode>(CSINC.getOperand(0));
  if (!X)
    return SDValue();

  // Build csinv(0 - X, Y, CC), reusing CSINC's condition and flags operands.
  return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
                     DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
                                 CSINC.getOperand(0)),
                     CSINC.getOperand(1), CSINC.getOperand(2),
                     CSINC.getOperand(3));
}
13351
/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
///
                                 const ARMSubtarget *Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
  if (N1.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
      return Result;

  if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
    return R;

  // The remaining fold is MVE-only and vector-only.
  if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
    return SDValue();

  // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
  // so that we can readily pattern match more mve instructions which can use
  // a scalar operand.
  SDValue VDup = N->getOperand(1);
  if (VDup->getOpcode() != ARMISD::VDUP)
    return SDValue();

  SDValue VMov = N->getOperand(0);
  if (VMov->getOpcode() == ISD::BITCAST)
    VMov = VMov->getOperand(0);

  if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
    return SDValue();

  // Negate the scalar once and dup the result.
  SDLoc dl(N);
  SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
                                   DCI.DAG.getConstant(0, dl, MVT::i32),
                                   VDup->getOperand(0));
  return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
}
13391
/// PerformVMULCombine
/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
/// special multiplier accumulator forwarding.
///   vmul d3, d0, d2
///   vmla d3, d1, d2
/// is faster than
///   vadd d3, d0, d1
///   vmul d3, d3, d2
//  However, for (A + B) * (A + B),
//    vadd d2, d0, d1
//    vmul d3, d0, d2
//    vmla d3, d1, d2
//  is slower than
//    vadd d2, d0, d1
//    vmul d3, d2, d2
                                  const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasVMLxForwarding())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  unsigned Opcode = N0.getOpcode();
  if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
      Opcode != ISD::FADD && Opcode != ISD::FSUB) {
    Opcode = N1.getOpcode();
    if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
        Opcode != ISD::FADD && Opcode != ISD::FSUB)
      return SDValue();
    // Canonicalize so the add/sub is N0.
    std::swap(N0, N1);
  }

  // Don't distribute (A + B) * (A + B); see the comment above.
  if (N0 == N1)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  SDValue N00 = N0->getOperand(0);
  SDValue N01 = N0->getOperand(1);
  return DAG.getNode(Opcode, DL, VT,
                     DAG.getNode(ISD::MUL, DL, VT, N00, N1),
                     DAG.getNode(ISD::MUL, DL, VT, N01, N1));
}
13437
                                       const ARMSubtarget *Subtarget) {
  // Match a v2i64 multiply of operands extended from 32 bits and turn it
  // into a VMULLs/VMULLu (32x32->64 widening multiply).
  EVT VT = N->getValueType(0);
  if (VT != MVT::v2i64)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Returns Op's pre-extension value if Op is sign_extend_inreg from i32.
  auto IsSignExt = [&](SDValue Op) {
    if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
      return SDValue();
    EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
    if (VT.getScalarSizeInBits() == 32)
      return Op->getOperand(0);
    return SDValue();
  };
  auto IsZeroExt = [&](SDValue Op) {
    // Zero extends are a little more awkward. At the point we are matching
    // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
    // That might be before or after a bitcast depending on how the and is
    // placed. Because this has to look through bitcasts, it is currently only
    // supported on LE.
    if (!Subtarget->isLittle())
      return SDValue();

    SDValue And = Op;
    if (And->getOpcode() == ISD::BITCAST)
      And = And->getOperand(0);
    if (And->getOpcode() != ISD::AND)
      return SDValue();
    SDValue Mask = And->getOperand(1);
    if (Mask->getOpcode() == ISD::BITCAST)
      Mask = Mask->getOperand(0);

    if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
        Mask.getValueType() != MVT::v4i32)
      return SDValue();
    if (isAllOnesConstant(Mask->getOperand(0)) &&
        isNullConstant(Mask->getOperand(1)) &&
        isAllOnesConstant(Mask->getOperand(2)) &&
        isNullConstant(Mask->getOperand(3)))
      return And->getOperand(0);
    return SDValue();
  };

  SDLoc dl(N);
  if (SDValue Op0 = IsSignExt(N0)) {
    if (SDValue Op1 = IsSignExt(N1)) {
      // NOTE(review): the lines defining New0a/New1a from Op0/Op1 are elided
      // from this view of the file.
      return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
    }
  }
  if (SDValue Op0 = IsZeroExt(N0)) {
    if (SDValue Op1 = IsZeroExt(N1)) {
      // NOTE(review): the New0a/New1a definitions are likewise elided here.
      return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
    }
  }

  return SDValue();
}
13502
                                 const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;

  EVT VT = N->getValueType(0);
  if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
    return PerformMVEVMULLCombine(N, DAG, Subtarget);

  if (Subtarget->isThumb1Only())
    return SDValue();

  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  if (VT.is64BitVector() || VT.is128BitVector())
    return PerformVMULCombine(N, DCI, Subtarget);
  if (VT != MVT::i32)
    return SDValue();

  // Replace (mul x, C) with shift/add/sub sequences for suitable constants.
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();

  int64_t MulAmt = C->getSExtValue();
  unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);

  // Factor the constant as OddPart << ShiftAmt; handle the odd part below
  // and re-apply the shift at the end.
  ShiftAmt = ShiftAmt & (32 - 1);
  SDValue V = N->getOperand(0);
  SDLoc DL(N);

  SDValue Res;
  MulAmt >>= ShiftAmt;

  if (MulAmt >= 0) {
    if (isPowerOf2_32(MulAmt - 1)) {
      // (mul x, 2^N + 1) => (add (shl x, N), x)
      Res = DAG.getNode(ISD::ADD, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmt - 1), DL,
                                                    MVT::i32)));
    } else if (isPowerOf2_32(MulAmt + 1)) {
      // (mul x, 2^N - 1) => (sub (shl x, N), x)
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmt + 1), DL,
                                                    MVT::i32)),
                        V);
    } else
      return SDValue();
  } else {
    uint64_t MulAmtAbs = -MulAmt;
    if (isPowerOf2_32(MulAmtAbs + 1)) {
      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
                                                    MVT::i32)));
    } else if (isPowerOf2_32(MulAmtAbs - 1)) {
      // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
      Res = DAG.getNode(ISD::ADD, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
                                                    MVT::i32)));
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getConstant(0, DL, MVT::i32), Res);
    } else
      return SDValue();
  }

  // Re-apply the power-of-two factor that was divided out above.
  if (ShiftAmt != 0)
    Res = DAG.getNode(ISD::SHL, DL, VT,
                      Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));

  // Do not add new nodes to DAG combiner worklist.
  DCI.CombineTo(N, Res, false);
  return SDValue();
}
13588
                                const ARMSubtarget *Subtarget) {
  // Thumb1: rewrite (and (shl/srl x, c2), c1) as a pair of shifts to avoid
  // materializing the mask constant c1.
  // Allow DAGCombine to pattern-match before we touch the canonical form.
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!N1C)
    return SDValue();

  uint32_t C1 = (uint32_t)N1C->getZExtValue();
  // Don't transform uxtb/uxth.
  if (C1 == 255 || C1 == 65535)
    return SDValue();

  SDNode *N0 = N->getOperand(0).getNode();
  if (!N0->hasOneUse())
    return SDValue();

  if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
    return SDValue();

  bool LeftShift = N0->getOpcode() == ISD::SHL;

  // NOTE(review): the line defining N01C (dyn_cast of the shift amount) is
  // elided from this view of the file.
  if (!N01C)
    return SDValue();

  uint32_t C2 = (uint32_t)N01C->getZExtValue();
  if (!C2 || C2 >= 32)
    return SDValue();

  // Clear irrelevant bits in the mask.
  if (LeftShift)
    C1 &= (-1U << C2);
  else
    C1 &= (-1U >> C2);

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  // We have a pattern of the form "(and (shl x, c2) c1)" or
  // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
  // transform to a pair of shifts, to save materializing c1.

  // First pattern: right shift, then mask off leading bits.
  // FIXME: Use demanded bits?
  if (!LeftShift && isMask_32(C1)) {
    // NOTE(review): the definition of C3 for this pattern is elided from this
    // view of the file.
    if (C2 < C3) {
      SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C3 - C2, DL, MVT::i32));
      return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // First pattern, reversed: left shift, then mask off trailing bits.
  if (LeftShift && isMask_32(~C1)) {
    // NOTE(review): the definition of C3 for this pattern is elided here.
    if (C2 < C3) {
      SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C3 - C2, DL, MVT::i32));
      return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // Second pattern: left shift, then mask off leading bits.
  // FIXME: Use demanded bits?
  if (LeftShift && isShiftedMask_32(C1)) {
    // NOTE(review): the definitions of Trailing and C3 are elided here.
    if (Trailing == C2 && C2 + C3 < 32) {
      SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C2 + C3, DL, MVT::i32));
      return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // Second pattern, reversed: right shift, then mask off trailing bits.
  // FIXME: Handle other patterns of known/demanded bits.
  if (!LeftShift && isShiftedMask_32(C1)) {
    // NOTE(review): the definitions of Leading and C3 are elided here.
    if (Leading == C2 && C2 + C3 < 32) {
      SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C2 + C3, DL, MVT::i32));
      return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // FIXME: Transform "(and (shl x, c2) c1)" ->
  //        "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate
  //        than c1.
  return SDValue();
}
13692
                                 const ARMSubtarget *Subtarget) {
  // Attempt to use immediate-form VBIC
  // NOTE(review): the line defining BVN (BuildVectorSDNode dyn_cast of
  // operand 1) is elided from this view of the file.
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v4i1 ||
      VT == MVT::v8i1 || VT == MVT::v16i1)
    return SDValue();

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
        SplatBitSize == 64) {
      EVT VbicVT;
      // VBIC clears the bits set in its immediate, so encode ~SplatBits.
      SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
                                      SplatUndef.getZExtValue(), SplatBitSize,
                                      DAG, dl, VbicVT, VT, OtherModImm);
      if (Val.getNode()) {
        SDValue Input =
          DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
        SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
      }
    }
  }

  if (!Subtarget->isThumb1Only()) {
    // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
      return Result;

    if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
      return Result;
  }

  if (Subtarget->isThumb1Only())
    if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
      return Result;

  return SDValue();
}
13741
// Try combining OR nodes to SMULWB, SMULWT.
                                         const ARMSubtarget *Subtarget) {
  // SMULW[BT] needs V6 and DSP (and Thumb2 when in Thumb mode).
  if (!Subtarget->hasV6Ops() ||
      (Subtarget->isThumb() &&
       (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
    return SDValue();

  SDValue SRL = OR->getOperand(0);
  SDValue SHL = OR->getOperand(1);

  // Accept the srl/shl pair in either operand order.
  if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
    SRL = OR->getOperand(1);
    SHL = OR->getOperand(0);
  }
  if (!isSRL16(SRL) || !isSHL16(SHL))
    return SDValue();

  // The first operands to the shifts need to be the two results from the
  // same smul_lohi node.
  if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
       SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
    return SDValue();

  SDNode *SMULLOHI = SRL.getOperand(0).getNode();
  if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
      SHL.getOperand(0) != SDValue(SMULLOHI, 1))
    return SDValue();

  // Now we have:
  // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
  // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit arguments.
  // For SMULWB the 16-bit value will be sign extended somehow.
  // For SMULWT only the SRA is required.
  // Check both sides of SMUL_LOHI
  SDValue OpS16 = SMULLOHI->getOperand(0);
  SDValue OpS32 = SMULLOHI->getOperand(1);

  SelectionDAG &DAG = DCI.DAG;
  if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
    OpS16 = OpS32;
    OpS32 = SMULLOHI->getOperand(0);
  }

  SDLoc dl(OR);
  unsigned Opcode = 0;
  if (isS16(OpS16, DAG))
    Opcode = ARMISD::SMULWB;
  else if (isSRA16(OpS16)) {
    // Top half: drop the sra and let SMULWT pick the high 16 bits.
    Opcode = ARMISD::SMULWT;
    OpS16 = OpS16->getOperand(0);
  }
  else
    return SDValue();

  SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
  DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
  return SDValue(OR, 0);
}
13802
                                  const ARMSubtarget *Subtarget) {
  // BFI is only available on V6T2+
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  // 1) or (and A, mask), val => ARMbfi A, val, mask
  //      iff (val & mask) == val
  //
  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
  //          && mask == ~mask2
  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
  //          && ~mask == mask2
  //  (i.e., copy a bitfield value into another bitfield of the same width)

  if (VT != MVT::i32)
    return SDValue();

  SDValue N00 = N0.getOperand(0);

  // The value and the mask need to be constants so we can verify this is
  // actually a bitfield set. If the mask is 0xffff, we can do better
  // via a movt instruction, so don't use BFI in that case.
  SDValue MaskOp = N0.getOperand(1);
  // NOTE(review): the line defining MaskC (dyn_cast of MaskOp) is elided
  // from this view of the file.
  if (!MaskC)
    return SDValue();
  unsigned Mask = MaskC->getZExtValue();
  if (Mask == 0xffff)
    return SDValue();
  SDValue Res;
  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
  // NOTE(review): the line defining N1C (dyn_cast of N1) is elided here.
  if (N1C) {
    unsigned Val = N1C->getZExtValue();
    if ((Val & ~Mask) != Val)
      return SDValue();

    if (ARM::isBitFieldInvertedMask(Mask)) {
      // Shift the value down to the field's bit position.
      Val >>= countTrailingZeros(~Mask);

      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
                        DAG.getConstant(Val, DL, MVT::i32),
                        DAG.getConstant(Mask, DL, MVT::i32));

      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  } else if (N1.getOpcode() == ISD::AND) {
    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
    // NOTE(review): the line defining N11C (dyn_cast of N1's mask operand)
    // is elided here.
    if (!N11C)
      return SDValue();
    unsigned Mask2 = N11C->getZExtValue();

    // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
    // as is to match.
    if (ARM::isBitFieldInvertedMask(Mask) &&
        (Mask == ~Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask == 0xffff || Mask == 0xffff0000))
        return SDValue();
      // 2a
      unsigned amt = countTrailingZeros(Mask2);
      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
                        DAG.getConstant(amt, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                        DAG.getConstant(Mask, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
               (~Mask == Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask2 == 0xffff || Mask2 == 0xffff0000))
        return SDValue();
      // 2b
      unsigned lsb = countTrailingZeros(Mask);
      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                        DAG.getConstant(lsb, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
                        DAG.getConstant(Mask2, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  }

  // NOTE(review): the continuation line of this condition (checking the mask
  // shape) is elided from this view of the file.
  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
    // where lsb(mask) == #shamt and masked bits of B are known zero.
    SDValue ShAmt = N00.getOperand(1);
    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
    unsigned LSB = countTrailingZeros(Mask);
    if (ShAmtC != LSB)
      return SDValue();

    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
                      DAG.getConstant(~Mask, DL, MVT::i32));

    DCI.CombineTo(N, Res, false);
    // Return value from the original node to inform the combiner that N is
    // now dead.
    return SDValue(N, 0);
  }

  return SDValue();
}
13928
13929static bool isValidMVECond(unsigned CC, bool IsFloat) {
13930 switch (CC) {
13931 case ARMCC::EQ:
13932 case ARMCC::NE:
13933 case ARMCC::LE:
13934 case ARMCC::GT:
13935 case ARMCC::GE:
13936 case ARMCC::LT:
13937 return true;
13938 case ARMCC::HS:
13939 case ARMCC::HI:
13940 return !IsFloat;
13941 default:
13942 return false;
13943 };
13944}
13945
  // Return the condition-code operand of a VCMP (operand 2) or VCMPZ
  // (operand 1) node; asserts on anything else.
  if (N->getOpcode() == ARMISD::VCMP)
    return (ARMCC::CondCodes)N->getConstantOperandVal(2);
  else if (N->getOpcode() == ARMISD::VCMPZ)
    return (ARMCC::CondCodes)N->getConstantOperandVal(1);
  else
    llvm_unreachable("Not a VCMP/VCMPZ!");
}
13954
  // NOTE(review): the lines computing CC (the opposite of N's condition code)
  // are elided from this view; the inverted condition is usable iff it is a
  // valid MVE condition for the compared type (float vs integer).
  return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
}
13959
                                       const ARMSubtarget *Subtarget) {
  // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
  // together with predicates
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // A VCMP/VCMPZ whose condition has a valid MVE inverse costs nothing to
  // negate.
  auto IsFreelyInvertable = [&](SDValue V) {
    if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
      return CanInvertMVEVCMP(V);
    return false;
  };

  // At least one operand must be freely invertible.
  // NOTE(review): the guarding `if` line testing IsFreelyInvertable on N0
  // and N1 is elided from this view of the file.
    return SDValue();

  SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);
  SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);
  SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
  return DAG.getLogicalNOT(DL, And, VT);
}
13984
13985/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
13988 const ARMSubtarget *Subtarget) {
13989 // Attempt to use immediate-form VORR
13991 SDLoc dl(N);
13992 EVT VT = N->getValueType(0);
13993 SelectionDAG &DAG = DCI.DAG;
13994
13995 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
13996 return SDValue();
13997
// MVE predicate types get their own invert-based combine.
13998 if (Subtarget->hasMVEIntegerOps() &&
13999 (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1))
14000 return PerformORCombine_i1(N, DAG, Subtarget);
14001
// If the RHS is a constant splat encodable as a VORR modified-immediate,
// emit VORRIMM on the bitcast input and cast the result back.
14002 APInt SplatBits, SplatUndef;
14003 unsigned SplatBitSize;
14004 bool HasAnyUndefs;
14005 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14006 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14007 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14008 SplatBitSize == 64) {
14009 EVT VorrVT;
14010 SDValue Val =
14011 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
14012 SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
14013 if (Val.getNode()) {
14014 SDValue Input =
14015 DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
14016 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
14017 return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
14018 }
14019 }
14020 }
14021
14022 if (!Subtarget->isThumb1Only()) {
14023 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
14024 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14025 return Result;
14026 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
14027 return Result;
14028 }
14029
14030 SDValue N0 = N->getOperand(0);
14031 SDValue N1 = N->getOperand(1);
14032
14033 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
14034 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
14036
14037 // The code below optimizes (or (and X, Y), Z).
14038 // The AND operand needs to have a single user to make these optimizations
14039 // profitable.
14040 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
14041 return SDValue();
14042
14043 APInt SplatUndef;
14044 unsigned SplatBitSize;
14045 bool HasAnyUndefs;
14046
14050 // Ensure that the second operand of both ands are constants
14051 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
14053 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
14055 // Ensure that the bit width of the constants are the same and that
14056 // the splat arguments are logical inverses as per the pattern we
14057 // are trying to simplify.
14058 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
14059 SplatBits0 == ~SplatBits1) {
14060 // Canonicalize the vector type to make instruction selection
14061 // simpler.
14063 SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
14064 N0->getOperand(1),
14065 N0->getOperand(0),
14066 N1->getOperand(0));
14067 return DAG.getNode(ISD::BITCAST, dl, VT, Result);
14068 }
14069 }
14070 }
14071 }
14072
14073 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
14074 // reasonable.
14075 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
14076 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
14077 return Res;
14078 }
14079
14080 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14081 return Result;
14082
14083 return SDValue();
14084}
14085
// Target-specific dag combine xforms for ISD::XOR.
14088 const ARMSubtarget *Subtarget) {
14089 EVT VT = N->getValueType(0);
14090 SelectionDAG &DAG = DCI.DAG;
14091
14092 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14093 return SDValue();
14094
14095 if (!Subtarget->isThumb1Only()) {
14096 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
14097 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14098 return Result;
14099
14100 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14101 return Result;
14102 }
14103
14104 if (Subtarget->hasMVEIntegerOps()) {
14105 // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
14106 SDValue N0 = N->getOperand(0);
14107 SDValue N1 = N->getOperand(1);
14108 const TargetLowering *TLI = Subtarget->getTargetLowering();
14109 if (TLI->isConstTrueVal(N1.getNode()) &&
14110 (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
14111 if (CanInvertMVEVCMP(N0)) {
14112 SDLoc DL(N0);
14114
// Rebuild the compare with the same operands but the inverted condition
// code appended as the final operand.
14116 Ops.push_back(N0->getOperand(0));
14117 if (N0->getOpcode() == ARMISD::VCMP)
14118 Ops.push_back(N0->getOperand(1));
14119 Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
14120 return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
14121 }
14122 }
14123 }
14124
14125 return SDValue();
14126}
14127
14128// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
14129// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
14130// their position in "to" (Rd).
14132 assert(N->getOpcode() == ARMISD::BFI);
14133
// Operand 2 of ARMISD::BFI is the *inverted* mask of the bits written in
// Rd; FromMask is the same number of bits, anchored at bit 0 of Rn.
14134 SDValue From = N->getOperand(1);
14135 ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
14136 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());
14137
14138 // If the Base came from a SHR #C, we can deduce that it is really testing bit
14139 // #C in the base of the SHR.
14140 if (From->getOpcode() == ISD::SRL &&
14141 isa<ConstantSDNode>(From->getOperand(1))) {
14142 APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue();
14143 assert(Shift.getLimitedValue() < 32 && "Shift too large!");
14144 FromMask <<= Shift.getLimitedValue(31);
14145 From = From->getOperand(0);
14146 }
14147
14148 return From;
14149}
14150
14151// If A and B contain one contiguous set of bits, does A | B == A . B?
14152//
14153// Neither A nor B must be zero.
14154static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
14155 unsigned LastActiveBitInA = A.countTrailingZeros();
14156 unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1;
14157 return LastActiveBitInA - 1 == FirstActiveBitInB;
14158}
14159
14161 // We have a BFI in N. Find a BFI it can combine with, if one exists.
// Candidate: N's destination operand may itself be a BFI that inserts bits
// of the same source register into a disjoint set of destination bits.
14164 SDValue To = N->getOperand(0);
14165
14166 SDValue V = To;
14167 if (V.getOpcode() != ARMISD::BFI)
14168 return SDValue();
14169
14171 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
// Both BFIs must read from the same source register to be merged.
14172 if (NewFrom != From)
14173 return SDValue();
14174
14175 // Do the written bits conflict with any we've seen so far?
14176 if ((NewToMask & ToMask).getBoolValue())
14177 // Conflicting bits.
14178 return SDValue();
14179
14180 // Are the new bits contiguous when combined with the old bits?
14183 return V;
14186 return V;
14187
14188 return SDValue();
14189}
14190
// Target-specific dag combine xforms for ARMISD::BFI.
14192 SDValue N0 = N->getOperand(0);
14193 SDValue N1 = N->getOperand(1);
14194
14195 if (N1.getOpcode() == ISD::AND) {
14196 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
14197 // the bits being cleared by the AND are not demanded by the BFI.
14199 if (!N11C)
14200 return SDValue();
// Recover the inserted field's position (LSB) and Width from the BFI's
// inverted-mask operand, then build the mask of bits it actually reads.
14201 unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
14202 unsigned LSB = countTrailingZeros(~InvMask);
14203 unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
14204 assert(Width <
14205 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
14206 "undefined behavior");
14207 unsigned Mask = (1u << Width) - 1;
14208 unsigned Mask2 = N11C->getZExtValue();
14209 if ((Mask & (~Mask2)) == 0)
14210 return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
14211 N->getOperand(0), N1.getOperand(0), N->getOperand(2));
14212 return SDValue();
14213 }
14214
14215 // Look for another BFI to combine with.
14217 // We've found a BFI.
14220
14223 assert(From1 == From2);
14224 (void)From2;
14225
14226 // Create a new BFI, combining the two together.
14229
14230 EVT VT = N->getValueType(0);
14231 SDLoc dl(N);
14232
// If the merged source bits do not start at bit 0, shift the source right
// so the field is anchored before re-inserting.
14233 if (NewFromMask[0] == 0)
14234 From1 = DAG.getNode(
14235 ISD::SRL, dl, VT, From1,
14236 DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
14237 return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
14238 DAG.getConstant(~NewToMask, dl, VT));
14239 }
14240
14241 // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
14242 // that lower bit insertions are performed first, providing that M1 and M2
14243 // do no overlap. This can allow multiple BFI instructions to be combined
14244 // together by the other folds above.
14245 if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
14246 APInt ToMask1 = ~N->getConstantOperandAPInt(2);
14247 APInt ToMask2 = ~N0.getConstantOperandAPInt(2);
14248
14249 if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
14250 ToMask1.countLeadingZeros() < ToMask2.countLeadingZeros())
14251 return SDValue();
14252
14253 EVT VT = N->getValueType(0);
14254 SDLoc dl(N);
14255 SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
14256 N->getOperand(1), N->getOperand(2));
14257 return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
14258 N0.getOperand(2));
14259 }
14260
14261 return SDValue();
14262}
14263
14264/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
14265/// ARMISD::VMOVRRD.
14268 const ARMSubtarget *Subtarget) {
14269 // vmovrrd(vmovdrr x, y) -> x,y
14270 SDValue InDouble = N->getOperand(0);
14271 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
14272 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
14273
14274 // vmovrrd(load f64) -> (load i32), (load i32)
// Split a 64-bit frame-index load feeding VMOVRRD into two i32 loads: one
// at the base pointer and one at base+4 bytes.
14275 SDNode *InNode = InDouble.getNode();
14276 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
14277 InNode->getValueType(0) == MVT::f64 &&
14278 InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
14279 !cast<LoadSDNode>(InNode)->isVolatile()) {
14280 // TODO: Should this be done for non-FrameIndex operands?
14282
14283 SelectionDAG &DAG = DCI.DAG;
14284 SDLoc DL(LD);
14285 SDValue BasePtr = LD->getBasePtr();
14286 SDValue NewLD1 =
14287 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
14288 LD->getAlignment(), LD->getMemOperand()->getFlags());
14289
14290 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
14291 DAG.getConstant(4, DL, MVT::i32));
14292
14293 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
14294 LD->getPointerInfo().getWithOffset(4),
14295 std::min(4U, LD->getAlignment()),
14296 LD->getMemOperand()->getFlags());
14297
// Re-route the old load's chain result through the second new load.
14298 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
14299 if (DCI.DAG.getDataLayout().isBigEndian())
14301 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
14302 return Result;
14303 }
14304
14305 // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
14306 // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
14307 if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14308 isa<ConstantSDNode>(InDouble.getOperand(1))) {
14309 SDValue BV = InDouble.getOperand(0);
14310 // Look up through any nop bitcasts and vector_reg_casts. bitcasts may
14311 // change lane order under big endian.
14312 bool BVSwap = BV.getOpcode() == ISD::BITCAST;
14313 while (
14314 (BV.getOpcode() == ISD::BITCAST ||
14315 BV.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
14316 (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
14317 BVSwap = BV.getOpcode() == ISD::BITCAST;
14318 BV = BV.getOperand(0);
14319 }
14320 if (BV.getValueType() != MVT::v4i32)
14321 return SDValue();
14322
14323 // Handle buildvectors, pulling out the correct lane depending on
14324 // endianness.
14325 unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
14326 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
14327 SDValue Op0 = BV.getOperand(Offset);
14328 SDValue Op1 = BV.getOperand(Offset + 1);
14329 if (!Subtarget->isLittle() && BVSwap)
14330 std::swap(Op0, Op1);
14331
14332 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
14333 }
14334
14335 // A chain of insert_vectors, grabbing the correct value of the chain of
14336 // inserts.
14337 SDValue Op0, Op1;
14338 while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
14339 if (isa<ConstantSDNode>(BV.getOperand(2))) {
14340 if (BV.getConstantOperandVal(2) == Offset)
14341 Op0 = BV.getOperand(1);
14342 if (BV.getConstantOperandVal(2) == Offset + 1)
14343 Op1 = BV.getOperand(1);
14344 }
14345 BV = BV.getOperand(0);
14346 }
14347 if (!Subtarget->isLittle() && BVSwap)
14348 std::swap(Op0, Op1);
// Only fold when both lanes were found in the insert chain.
14349 if (Op0 && Op1)
14350 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
14351 }
14352
14353 return SDValue();
14354}
14355
14356/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
14357/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
14359 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
// Strip bitcasts, then require that the two halves are exactly results 0
// and 1 of a single VMOVRRD node.
14360 SDValue Op0 = N->getOperand(0);
14361 SDValue Op1 = N->getOperand(1);
14362 if (Op0.getOpcode() == ISD::BITCAST)
14363 Op0 = Op0.getOperand(0);
14364 if (Op1.getOpcode() == ISD::BITCAST)
14365 Op1 = Op1.getOperand(0);
14366 if (Op0.getOpcode() == ARMISD::VMOVRRD &&
14367 Op0.getNode() == Op1.getNode() &&
14368 Op0.getResNo() == 0 && Op1.getResNo() == 1)
14369 return DAG.getNode(ISD::BITCAST, SDLoc(N),
14370 N->getValueType(0), Op0.getOperand(0));
14371 return SDValue();
14372}
14373
// Target-specific dag combine xforms for ARMISD::VMOVhr.
14376 SDValue Op0 = N->getOperand(0);
14377
14378 // VMOVhr (VMOVrh (X)) -> X
14379 if (Op0->getOpcode() == ARMISD::VMOVrh)
14380 return Op0->getOperand(0);
14381
14382 // FullFP16: half values are passed in S-registers, and we don't
14383 // need any of the bitcast and moves:
14384 //
14385 // t2: f32,ch = CopyFromReg t0, Register:f32 %0
14386 // t5: i32 = bitcast t2
14387 // t18: f16 = ARMISD::VMOVhr t5
14388 if (Op0->getOpcode() == ISD::BITCAST) {
14389 SDValue Copy = Op0->getOperand(0);
14390 if (Copy.getValueType() == MVT::f32 &&
14391 Copy->getOpcode() == ISD::CopyFromReg) {
// Re-issue the CopyFromReg directly at the f16 result type.
14392 SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1)};
14394 DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(N), N->getValueType(0), Ops);
14395 return NewCopy;
14396 }
14397 }
14398
14399 // fold (VMOVhr (load x)) -> (load (f16*)x)
14400 if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
14401 if (LN0->hasOneUse() && LN0->isUnindexed() &&
14402 LN0->getMemoryVT() == MVT::i16) {
14403 SDValue Load =
14404 DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
14405 LN0->getBasePtr(), LN0->getMemOperand());
14406 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
14407 DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
14408 return Load;
14409 }
14410 }
14411
14412 // Only the bottom 16 bits of the source register are used.
14414 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
14415 if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
14416 return SDValue(N, 0);
14417
14418 return SDValue();
14419}
14420
// Target-specific dag combine xforms for ARMISD::VMOVrh.
14422 SDValue N0 = N->getOperand(0);
14423 EVT VT = N->getValueType(0);
14424
14425 // fold (VMOVrh (fpconst x)) -> const x
// Materialize the half constant's raw bit pattern directly as an integer.
14427 APFloat V = C->getValueAPF();
14428 return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
14429 }
14430
14431 // fold (VMOVrh (load x)) -> (zextload (i16*)x)
14432 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
14434
14435 SDValue Load =
14436 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
14437 LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
14438 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
14439 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
14440 return Load;
14441 }
14442
14443 // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
14444 if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14446 return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
14447 N0->getOperand(1));
14448
14449 return SDValue();
14450}
14451
14452/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
14453/// are normal, non-volatile loads. If so, it is profitable to bitcast an
14454/// i64 vector to have f64 elements, since the value can then be loaded
14455/// directly into a VFP register.
14457 unsigned NumElts = N->getValueType(0).getVectorNumElements();
14458 for (unsigned i = 0; i < NumElts; ++i) {
14459 SDNode *Elt = N->getOperand(i).getNode();
// A single qualifying load is enough to make the transformation pay off.
14460 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
14461 return true;
14462 }
14463 return false;
14464}
14465
14466/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
14467/// ISD::BUILD_VECTOR.
14470 const ARMSubtarget *Subtarget) {
14471 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
14472 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
14473 // into a pair of GPRs, which is fine when the value is used as a scalar,
14474 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
14475 SelectionDAG &DAG = DCI.DAG;
14476 if (N->getNumOperands() == 2)
14477 if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
14478 return RV;
14479
14480 // Load i64 elements as f64 values so that type legalization does not split
14481 // them up into i32 values.
14482 EVT VT = N->getValueType(0);
14484 return SDValue();
14485 SDLoc dl(N);
14487 unsigned NumElts = VT.getVectorNumElements();
14488 for (unsigned i = 0; i < NumElts; ++i) {
14489 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
14490 Ops.push_back(V);
14491 // Make the DAGCombiner fold the bitcast.
14492 DCI.AddToWorklist(V.getNode());
14493 }
// Rebuild as an f64 vector, then bitcast back to the requested type.
14494 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
14495 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
14496 return DAG.getNode(ISD::BITCAST, dl, VT, BV);
14497}
14498
14499/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
14500static SDValue
14502 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
14503 // At that time, we may have inserted bitcasts from integer to float.
14504 // If these bitcasts have survived DAGCombine, change the lowering of this
14505 // BUILD_VECTOR in something more vector friendly, i.e., that does not
14506 // force to use floating point types.
14507
14508 // Make sure we can change the type of the vector.
14509 // This is possible iff:
14510 // 1. The vector is only used in a bitcast to a integer type. I.e.,
14511 // 1.1. Vector is used only once.
14512 // 1.2. Use is a bit convert to an integer type.
14513 // 2. The size of its operands are 32-bits (64-bits are not legal).
14514 EVT VT = N->getValueType(0);
14516
14517 // Check 1.1. and 2.
14518 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
14519 return SDValue();
14520
14521 // By construction, the input type must be float.
14522 assert(EltVT == MVT::f32 && "Unexpected type!");
14523
14524 // Check 1.2.
14525 SDNode *Use = *N->use_begin();
14526 if (Use->getOpcode() != ISD::BITCAST ||
14527 Use->getValueType(0).isFloatingPoint())
14528 return SDValue();
14529
14530 // Check profitability.
14531 // Model is, if more than half of the relevant operands are bitcast from
14532 // i32, turn the build_vector into a sequence of insert_vector_elt.
14533 // Relevant operands are everything that is not statically
14534 // (i.e., at compile time) bitcasted.
14535 unsigned NumOfBitCastedElts = 0;
14536 unsigned NumElts = VT.getVectorNumElements();
14537 unsigned NumOfRelevantElts = NumElts;
14538 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
14539 SDValue Elt = N->getOperand(Idx);
14540 if (Elt->getOpcode() == ISD::BITCAST) {
14541 // Assume only bit cast to i32 will go away.
14542 if (Elt->getOperand(0).getValueType() == MVT::i32)
14544 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
14545 // Constants are statically casted, thus do not count them as
14546 // relevant operands.
14548 }
14549
14550 // Check if more than half of the elements require a non-free bitcast.
14552 return SDValue();
14553
14554 SelectionDAG &DAG = DCI.DAG;
14555 // Create the new vector type.
14557 // Check if the type is legal.
14558 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14559 if (!TLI.isTypeLegal(VecVT))
14560 return SDValue();
14561
14562 // Combine:
14563 // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
14564 // => BITCAST INSERT_VECTOR_ELT
14565 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
14566 // (BITCAST EN), N.
14567 SDValue Vec = DAG.getUNDEF(VecVT);
14568 SDLoc dl(N);
14569 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
14570 SDValue V = N->getOperand(Idx);
14571 if (V.isUndef())
14572 continue;
14573 if (V.getOpcode() == ISD::BITCAST &&
14574 V->getOperand(0).getValueType() == MVT::i32)
14575 // Fold obvious case.
14576 V = V.getOperand(0);
14577 else {
14578 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
14579 // Make the DAGCombiner fold the bitcasts.
14580 DCI.AddToWorklist(V.getNode());
14581 }
// Insert the now-integer element into the integer vector.
14583 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
14584 }
14585 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
14586 // Make the DAGCombiner fold the bitcasts.
14587 DCI.AddToWorklist(Vec.getNode());
14588 return Vec;
14589}
14590
// Target-specific dag combine xforms for ARMISD::PREDICATE_CAST.
14591static SDValue
14593 EVT VT = N->getValueType(0);
14594 SDValue Op = N->getOperand(0);
14595 SDLoc dl(N);
14596
14597 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
14598 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
14599 // If the valuetypes are the same, we can remove the cast entirely.
14600 if (Op->getOperand(0).getValueType() == VT)
14601 return Op->getOperand(0);
14602 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
14603 }
14604
14605 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
14606 // more VPNOT which might get folded as else predicates.
// 65535 = 0xFFFF: an all-ones 16-bit predicate (only the low 16 bits of the
// i32 source are significant, as noted below).
14607 if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
14608 SDValue X =
14609 DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
14610 SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
14611 DCI.DAG.getConstant(65535, dl, MVT::i32));
14612 return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);
14613 }
14614
14615 // Only the bottom 16 bits of the source register are used.
14616 if (Op.getValueType() == MVT::i32) {
14618 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
14620 return SDValue(N, 0);
14621 }
14622 return SDValue();
14623}
14624
// Target-specific dag combine xforms for ARMISD::VECTOR_REG_CAST.
14626 const ARMSubtarget *ST) {
14627 EVT VT = N->getValueType(0);
14628 SDValue Op = N->getOperand(0);
14629 SDLoc dl(N);
14630
14631 // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
14632 if (ST->isLittle())
14633 return DAG.getNode(ISD::BITCAST, dl, VT, Op);
14634
14635 // VECTOR_REG_CAST undef -> undef
14636 if (Op.isUndef())
14637 return DAG.getUNDEF(VT);
14638
14639 // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
14640 if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
14641 // If the valuetypes are the same, we can remove the cast entirely.
14642 if (Op->getOperand(0).getValueType() == VT)
14643 return Op->getOperand(0);
14644 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
14645 }
14646
14647 return SDValue();
14648}
14649
// Canonicalize ARMISD::VCMP nodes (MVE only): prefer the compare-with-zero
// form, and keep any VDUP operand on the right-hand side.
14651 const ARMSubtarget *Subtarget) {
14652 if (!Subtarget->hasMVEIntegerOps())
14653 return SDValue();
14654
14655 EVT VT = N->getValueType(0);
14656 SDValue Op0 = N->getOperand(0);
14657 SDValue Op1 = N->getOperand(1);
14659 (ARMCC::CondCodes)cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
14660 SDLoc dl(N);
14661
14662 // vcmp X, 0, cc -> vcmpz X, cc
14663 if (isZeroVector(Op1))
14664 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2));
14665
14668 // vcmp 0, X, cc -> vcmpz X, reversed(cc)
14669 if (isZeroVector(Op0))
14670 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
14671 DAG.getConstant(SwappedCond, dl, MVT::i32));
14672 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
14673 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
14674 return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
14675 DAG.getConstant(SwappedCond, dl, MVT::i32));
14676 }
14677
14678 return SDValue();
14679}
14680
14681/// PerformInsertEltCombine - Target-specific dag combine xforms for
14682/// ISD::INSERT_VECTOR_ELT.
14685 // Bitcast an i64 load inserted into a vector to f64.
14686 // Otherwise, the i64 value will be legalized to a pair of i32 values.
// Only i64 elements fed by a plain, non-volatile load benefit.
14687 EVT VT = N->getValueType(0);
14688 SDNode *Elt = N->getOperand(1).getNode();
14689 if (VT.getVectorElementType() != MVT::i64 ||
14690 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
14691 return SDValue();
14692
14693 SelectionDAG &DAG = DCI.DAG;
14694 SDLoc dl(N);
14695 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
14697 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
14698 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
14699 // Make the DAGCombiner fold the bitcasts.
14700 DCI.AddToWorklist(Vec.getNode());
14701 DCI.AddToWorklist(V.getNode());
14702 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
14703 Vec, V, N->getOperand(2));
14704 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
14705}
14706
14707// Convert a pair of extracts from the same base vector to a VMOVRRD. Either
14708// directly or bitcast to an integer if the original is a float vector.
14709// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
14710// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
14711static SDValue
14713 EVT VT = N->getValueType(0);
14714 SDLoc dl(N);
14715
14716 if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
14717 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
14718 return SDValue();
14719
// Peel off a f32->i32 bitcast; the underlying node must be an extract of
// an even lane.
14720 SDValue Ext = SDValue(N, 0);
14721 if (Ext.getOpcode() == ISD::BITCAST &&
14722 Ext.getOperand(0).getValueType() == MVT::f32)
14723 Ext = Ext.getOperand(0);
14724 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14725 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
14726 Ext.getConstantOperandVal(1) % 2 != 0)
14727 return SDValue();
// Skip when the extract only feeds an int->fp conversion (presumably
// matched better by other patterns — confirm against upstream rationale).
14728 if (Ext->use_size() == 1 &&
14729 (Ext->use_begin()->getOpcode() == ISD::SINT_TO_FP ||
14730 Ext->use_begin()->getOpcode() == ISD::UINT_TO_FP))
14731 return SDValue();
14732
14733 SDValue Op0 = Ext.getOperand(0);
14734 EVT VecVT = Op0.getValueType();
14735 unsigned Lane = Ext.getConstantOperandVal(1);
14736 if (VecVT.getVectorNumElements() != 4)
14737 return SDValue();
14738
14739 // Find another extract, of Lane + 1
14740 auto OtherIt = find_if(Op0->uses(), [&](SDNode *V) {
14741 return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14742 isa<ConstantSDNode>(V->getOperand(1)) &&
14743 V->getConstantOperandVal(1) == Lane + 1;
14744 });
14745 if (OtherIt == Op0->uses().end())
14746 return SDValue();
14747
14748 // For float extracts, we need to be converting to a i32 for both vector
14749 // lanes.
14751 if (OtherExt.getValueType() != MVT::i32) {
14752 if (OtherExt->use_size() != 1 ||
14753 OtherExt->use_begin()->getOpcode() != ISD::BITCAST ||
14754 OtherExt->use_begin()->getValueType(0) != MVT::i32)
14755 return SDValue();
14756 OtherExt = SDValue(*OtherExt->use_begin(), 0);
14757 }
14758
14759 // Convert the type to a f64 and extract with a VMOVRRD.
14760 SDValue F64 = DCI.DAG.getNode(
14762 DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
14763 DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
14764 SDValue VMOVRRD =
14765 DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);
14766
// The partner extract receives the second i32 result of the VMOVRRD.
14767 DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));
14768 return VMOVRRD;
14769}
14770
// Target-specific dag combine xforms for ISD::EXTRACT_VECTOR_ELT.
14773 const ARMSubtarget *ST) {
14774 SDValue Op0 = N->getOperand(0);
14775 EVT VT = N->getValueType(0);
14776 SDLoc dl(N);
14777
14778 // extract (vdup x) -> x
14779 if (Op0->getOpcode() == ARMISD::VDUP) {
14780 SDValue X = Op0->getOperand(0);
14781 if (VT == MVT::f16 && X.getValueType() == MVT::i32)
14782 return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
14783 if (VT == MVT::i32 && X.getValueType() == MVT::f16)
14784 return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
14785 if (VT == MVT::f32 && X.getValueType() == MVT::i32)
14786 return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);
14787
// Otherwise look through bitcasts until the types line up.
14788 while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
14789 X = X->getOperand(0);
14790 if (X.getValueType() == VT)
14791 return X;
14792 }
14793
14794 // extract ARM_BUILD_VECTOR -> x
14795 if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&
14796 isa<ConstantSDNode>(N->getOperand(1)) &&
14797 N->getConstantOperandVal(1) < Op0.getNumOperands()) {
14798 return Op0.getOperand(N->getConstantOperandVal(1));
14799 }
14800
14801 // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
14802 if (Op0.getValueType() == MVT::v4i32 &&
14803 isa<ConstantSDNode>(N->getOperand(1)) &&
14804 Op0.getOpcode() == ISD::BITCAST &&
14806 Op0.getOperand(0).getValueType() == MVT::v2f64) {
14807 SDValue BV = Op0.getOperand(0);
14808 unsigned Offset = N->getConstantOperandVal(1);
14809 SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1);
// Lane parity selects which half of the VMOVDRR pair; endianness flips it.
14810 if (MOV.getOpcode() == ARMISD::VMOVDRR)
14811 return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
14812 }
14813
14814 // extract x, n; extract x, n+1 -> VMOVRRD x
14816 return R;
14817
14818 // extract (MVETrunc(x)) -> extract x
14819 if (Op0->getOpcode() == ARMISD::MVETRUNC) {
14820 unsigned Idx = N->getConstantOperandVal(1);
14821 unsigned Vec =
14823 unsigned SubIdx =
14825 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
14826 DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
14827 }
14828
14829 return SDValue();
14830}
14831
// Target-specific dag combine xforms for ISD::SIGN_EXTEND_INREG.
14833 SDValue Op = N->getOperand(0);
14834 EVT VT = N->getValueType(0);
14835
14836 // sext_inreg(VGETLANEu) -> VGETLANEs
// The lane extract already yields exactly the bits being sign-extended, so
// the signed form of the lane move gives the result directly.
14837 if (Op.getOpcode() == ARMISD::VGETLANEu &&
14838 cast<VTSDNode>(N->getOperand(1))->getVT() ==
14839 Op.getOperand(0).getValueType().getScalarType())
14840 return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0),
14841 Op.getOperand(1));
14842
14843 return SDValue();
14844}
14845
14846// When lowering complex nodes that we recognize, like VQDMULH and MULH, we
14847// can end up with shuffle(binop(shuffle, shuffle)), that can be simplified to
14848// binop as the shuffles cancel out.
14850 EVT VT = N->getValueType(0);
14851 if (!N->getOperand(1).isUndef() || N->getOperand(0).getValueType() != VT)
14852 return SDValue();
14853 SDValue Op = N->getOperand(0);
14854
14855 // Looking for binary operators that will have been folded from
14856 // truncates/extends.
14857 switch (Op.getOpcode()) {
14858 case ARMISD::VQDMULH:
14859 case ISD::MULHS:
14860 case ISD::MULHU:
14861 case ISD::ABDS:
14862 case ISD::ABDU:
14863 break;
14864 default:
14865 return SDValue();
14866 }
14867
// Both binop inputs must be one-operand shuffles with identical masks.
14868 ShuffleVectorSDNode *Op0 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0));
14869 ShuffleVectorSDNode *Op1 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1));
14870 if (!Op0 || !Op1 || !Op0->getOperand(1).isUndef() ||
14871 !Op1->getOperand(1).isUndef() || Op0->getMask() != Op1->getMask() ||
14872 Op0->getOperand(0).getValueType() != VT)
14873 return SDValue();
14874
14875 // Check the mask turns into an identity shuffle.
// NOTE(review): the `> 0` tests skip mask index 0 as well as undef (-1)
// lanes; confirm whether `>= 0` was intended here.
14876 ArrayRef<int> NMask = N->getMask();
14877 ArrayRef<int> OpMask = Op0->getMask();
14878 for (int i = 0, e = NMask.size(); i != e; i++) {
14879 if (NMask[i] > 0 && OpMask[NMask[i]] > 0 && OpMask[NMask[i]] != i)
14880 return SDValue();
14881 }
14882
14883 return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
14884 Op0->getOperand(0), Op1->getOperand(0));
14885}
14886
14887// shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
14889 SelectionDAG &DAG) {
14890 SDValue Trunc = N->getOperand(0);
14891 EVT VT = Trunc.getValueType();
14892 if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef())
14893 return SDValue();
14894
// The two accepted interleave masks differ only in which MVETRUNC operand
// supplies the first lane, hence the swapped operand order below.
14895 SDLoc DL(Trunc);
14896 if (isVMOVNTruncMask(N->getMask(), VT, 0))
14897 return DAG.getNode(
14898 ARMISD::VMOVN, DL, VT,
14899 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
14900 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
14901 DAG.getConstant(1, DL, MVT::i32));
14902 else if (isVMOVNTruncMask(N->getMask(), VT, 1))
14903 return DAG.getNode(
14904 ARMISD::VMOVN, DL, VT,
14905 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
14906 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
14907 DAG.getConstant(1, DL, MVT::i32));
14908 return SDValue();
14909}
14910
14911/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
14912/// ISD::VECTOR_SHUFFLE.
// NOTE(review): lines 14913-14916 are missing from this extract — they
// presumably contain the function signature and the two `if (SDValue R =
// FlattenVectorShuffle(...))` / `PerformShuffleVMOVNCombine(...)` calls whose
// `return R;` statements survive below. Confirm against the original source.
14915 return R;
14917 return R;
14918
14919 // The LLVM shufflevector instruction does not require the shuffle mask
14920 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
14921 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
14922 // operands do not match the mask length, they are extended by concatenating
14923 // them with undef vectors. That is probably the right thing for other
14924 // targets, but for NEON it is better to concatenate two double-register
14925 // size vector operands into a single quad-register size vector. Do that
14926 // transformation here:
14927 // shuffle(concat(v1, undef), concat(v2, undef)) ->
14928 // shuffle(concat(v1, v2), undef)
14929 SDValue Op0 = N->getOperand(0);
14930 SDValue Op1 = N->getOperand(1);
14931 if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
14932 Op1.getOpcode() != ISD::CONCAT_VECTORS ||
14933 Op0.getNumOperands() != 2 ||
14934 Op1.getNumOperands() != 2)
14935 return SDValue();
14936 SDValue Concat0Op1 = Op0.getOperand(1);
14937 SDValue Concat1Op1 = Op1.getOperand(1);
14938 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
14939 return SDValue();
14940 // Skip the transformation if any of the types are illegal.
14941 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14942 EVT VT = N->getValueType(0);
14943 if (!TLI.isTypeLegal(VT) ||
14944 !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
14945 !TLI.isTypeLegal(Concat1Op1.getValueType()))
14946 return SDValue();
14947
// NOTE(review): the declarations of NewConcat / SVN / NewMask (lines 14948,
// 14951, 14954) are elided in this extract.
14949 Op0.getOperand(0), Op1.getOperand(0));
14950 // Translate the shuffle mask.
14952 unsigned NumElts = VT.getVectorNumElements();
14953 unsigned HalfElts = NumElts/2;
// Remap each mask element: lanes from the low half of either concat keep
// their position relative to the new combined concat; anything else (the
// undef halves) becomes -1 (don't care).
14955 for (unsigned n = 0; n < NumElts; ++n) {
14956 int MaskElt = SVN->getMaskElt(n);
14957 int NewElt = -1;
14958 if (MaskElt < (int)HalfElts)
14959 NewElt = MaskElt;
14960 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
14962 NewMask.push_back(NewElt);
14963 }
14964 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
14965 DAG.getUNDEF(VT), NewMask);
14966}
14967
14968/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
14969/// NEON load/store intrinsics, and generic vector load/stores, to merge
14970/// base address updates.
14971/// For generic load/stores, the memory type is assumed to be a vector.
14972/// The caller is assumed to have checked legality.
// NOTE(review): several lines are elided in this extract (the signature at
// 14973-14974, the MemSDNode cast at 14981, the Visited/Worklist declarations
// at 14995-14996, the VLDnDUP opcode checks at 15090-15091, the CInc cast at
// 15099, the Ops/NewResults SmallVector declarations, and the getMemIntrinsicNode
// call around 15185-15186). Confirm against the original source before editing.
14975 SelectionDAG &DAG = DCI.DAG;
// Operand layout differs by node kind: intrinsics are (chain, intid, addr, ...),
// stores are (chain, value, addr, ...), loads are (chain, addr, ...).
14976 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
14977 N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
14978 const bool isStore = N->getOpcode() == ISD::STORE;
14979 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
14980 SDValue Addr = N->getOperand(AddrOpIdx);
14982 SDLoc dl(N);
14983
14984 // Search for a use of the address operand that is an increment.
14985 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
14986 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
14987 SDNode *User = *UI;
14988 if (User->getOpcode() != ISD::ADD ||
14989 UI.getUse().getResNo() != Addr.getResNo())
14990 continue;
14991
14992 // Check that the add is independent of the load/store. Otherwise, folding
14993 // it would create a cycle. We can avoid searching through Addr as it's a
14994 // predecessor to both.
14997 Visited.insert(Addr.getNode());
14998 Worklist.push_back(N);
14999 Worklist.push_back(User);
15000 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
15001 SDNode::hasPredecessorHelper(User, Visited, Worklist))
15002 continue;
15003
15004 // Find the new opcode for the updating load/store.
15005 bool isLoadOp = true;
15006 bool isLaneOp = false;
15007 // Workaround for vst1x and vld1x intrinsics which do not have alignment
15008 // as an operand.
15009 bool hasAlignment = true;
15010 unsigned NewOpc = 0;
15011 unsigned NumVecs = 0;
15012 if (isIntrinsic) {
15013 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
15014 switch (IntNo) {
15015 default: llvm_unreachable("unexpected intrinsic for Neon base update");
15016 case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD;
15017 NumVecs = 1; break;
15018 case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD;
15019 NumVecs = 2; break;
15020 case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD;
15021 NumVecs = 3; break;
15022 case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD;
15023 NumVecs = 4; break;
15024 case Intrinsic::arm_neon_vld1x2: NewOpc = ARMISD::VLD1x2_UPD;
15025 NumVecs = 2; hasAlignment = false; break;
15026 case Intrinsic::arm_neon_vld1x3: NewOpc = ARMISD::VLD1x3_UPD;
15027 NumVecs = 3; hasAlignment = false; break;
15028 case Intrinsic::arm_neon_vld1x4: NewOpc = ARMISD::VLD1x4_UPD;
15029 NumVecs = 4; hasAlignment = false; break;
15030 case Intrinsic::arm_neon_vld2dup: NewOpc = ARMISD::VLD2DUP_UPD;
15031 NumVecs = 2; break;
15032 case Intrinsic::arm_neon_vld3dup: NewOpc = ARMISD::VLD3DUP_UPD;
15033 NumVecs = 3; break;
15034 case Intrinsic::arm_neon_vld4dup: NewOpc = ARMISD::VLD4DUP_UPD;
15035 NumVecs = 4; break;
15036 case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
15037 NumVecs = 2; isLaneOp = true; break;
15038 case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
15039 NumVecs = 3; isLaneOp = true; break;
15040 case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
15041 NumVecs = 4; isLaneOp = true; break;
15042 case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD;
15043 NumVecs = 1; isLoadOp = false; break;
15044 case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD;
15045 NumVecs = 2; isLoadOp = false; break;
15046 case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD;
15047 NumVecs = 3; isLoadOp = false; break;
15048 case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD;
15049 NumVecs = 4; isLoadOp = false; break;
15050 case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
15051 NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
15052 case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
15053 NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
15054 case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
15055 NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
15056 case Intrinsic::arm_neon_vst1x2: NewOpc = ARMISD::VST1x2_UPD;
15057 NumVecs = 2; isLoadOp = false; hasAlignment = false; break;
15058 case Intrinsic::arm_neon_vst1x3: NewOpc = ARMISD::VST1x3_UPD;
15059 NumVecs = 3; isLoadOp = false; hasAlignment = false; break;
15060 case Intrinsic::arm_neon_vst1x4: NewOpc = ARMISD::VST1x4_UPD;
15061 NumVecs = 4; isLoadOp = false; hasAlignment = false; break;
15062 }
15063 } else {
// Non-intrinsic path: generic loads/stores and ARMISD nodes.
// NOTE(review): the case labels for lines 15067-15071 and 15073 are elided
// here (presumably ISD::LOAD / ISD::STORE and ARMISD::VLDxDUP cases).
15064 isLaneOp = true;
15065 switch (N->getOpcode()) {
15066 default: llvm_unreachable("unexpected opcode for Neon base update");
15072 NumVecs = 1; isLaneOp = false; break;
15074 NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
15075 }
15076 }
15077
15078 // Find the size of memory referenced by the load/store.
15079 EVT VecTy;
15080 if (isLoadOp) {
15081 VecTy = N->getValueType(0);
15082 } else if (isIntrinsic) {
15083 VecTy = N->getOperand(AddrOpIdx+1).getValueType();
15084 } else {
15085 assert(isStore && "Node has to be a load, a store, or an intrinsic!");
15086 VecTy = N->getOperand(1).getValueType();
15087 }
15088
// Dup and lane operations only access one element('s worth of bytes)
// per vector, so scale NumBytes down accordingly.
15089 bool isVLDDUPOp =
15092
15093 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
15094 if (isLaneOp || isVLDDUPOp)
15095 NumBytes /= VecTy.getVectorNumElements();
15096
15097 // If the increment is a constant, it must match the memory ref size.
15098 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
15100 if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) {
15101 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
15102 // separate instructions that make it harder to use a non-constant update.
15103 continue;
15104 }
15105
15106 // OK, we found an ADD we can fold into the base update.
15107 // Now, create a _UPD node, taking care of not breaking alignment.
15108
15109 EVT AlignedVecTy = VecTy;
15110 unsigned Alignment = MemN->getAlignment();
15111
15112 // If this is a less-than-standard-aligned load/store, change the type to
15113 // match the standard alignment.
15114 // The alignment is overlooked when selecting _UPD variants; and it's
15115 // easier to introduce bitcasts here than fix that.
15116 // There are 3 ways to get to this base-update combine:
15117 // - intrinsics: they are assumed to be properly aligned (to the standard
15118 // alignment of the memory type), so we don't need to do anything.
15119 // - ARMISD::VLDx nodes: they are only generated from the aforementioned
15120 // intrinsics, so, likewise, there's nothing to do.
15121 // - generic load/store instructions: the alignment is specified as an
15122 // explicit operand, rather than implicitly as the standard alignment
15123 // of the memory type (like the intrisics). We need to change the
15124 // memory type to match the explicit alignment. That way, we don't
15125 // generate non-standard-aligned ARMISD::VLDx nodes.
15126 if (isa<LSBaseSDNode>(N)) {
15127 if (Alignment == 0)
15128 Alignment = 1;
15129 if (Alignment < VecTy.getScalarSizeInBits() / 8) {
15130 MVT EltTy = MVT::getIntegerVT(Alignment * 8);
15131 assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
15132 assert(!isLaneOp && "Unexpected generic load/store lane.");
15133 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
// NOTE(review): line 15134 (the AlignedVecTy = MVT::getVectorVT(...)
// assignment, presumably) is elided in this extract.
15135 }
15136 // Don't set an explicit alignment on regular load/stores that we want
15137 // to transform to VLD/VST 1_UPD nodes.
15138 // This matches the behavior of regular load/stores, which only get an
15139 // explicit alignment if the MMO alignment is larger than the standard
15140 // alignment of the memory type.
15141 // Intrinsics, however, always get an explicit alignment, set to the
15142 // alignment of the MMO.
15143 Alignment = 1;
15144 }
15145
15146 // Create the new updating load/store node.
15147 // First, create an SDVTList for the new updating node's results.
15148 EVT Tys[6];
15149 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
15150 unsigned n;
15151 for (n = 0; n < NumResultVecs; ++n)
15152 Tys[n] = AlignedVecTy;
// Result list is: NumResultVecs vectors, then i32 (written-back base
// pointer), then the chain.
15153 Tys[n++] = MVT::i32;
15154 Tys[n] = MVT::Other;
15156
15157 // Then, gather the new node's operands.
15159 Ops.push_back(N->getOperand(0)); // incoming chain
15160 Ops.push_back(N->getOperand(AddrOpIdx));
15161 Ops.push_back(Inc);
15162
// NOTE(review): line 15163 (presumably `if (StoreSDNode *StN = ...) {`)
// is elided in this extract.
15164 // Try to match the intrinsic's signature
15165 Ops.push_back(StN->getValue());
15166 } else {
15167 // Loads (and of course intrinsics) match the intrinsics' signature,
15168 // so just add all but the alignment operand.
15169 unsigned LastOperand =
15170 hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
15171 for (unsigned i = AddrOpIdx + 1; i < LastOperand; ++i)
15172 Ops.push_back(N->getOperand(i));
15173 }
15174
15175 // For all node types, the alignment operand is always the last one.
15176 Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
15177
15178 // If this is a non-standard-aligned STORE, the penultimate operand is the
15179 // stored value. Bitcast it to the aligned type.
15180 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
15181 SDValue &StVal = Ops[Ops.size()-2];
15183 }
15184
15187 MemN->getMemOperand());
15188
15189 // Update the uses.
15191 for (unsigned i = 0; i < NumResultVecs; ++i)
15192 NewResults.push_back(SDValue(UpdN.getNode(), i));
15193
15194 // If this is an non-standard-aligned LOAD, the first result is the loaded
15195 // value. Bitcast it to the expected result type.
15196 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
15197 SDValue &LdVal = NewResults[0];
15198 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
15199 }
15200
// Replace N's results with the new node's, and redirect the ADD (User) to
// the written-back pointer result.
15201 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
15202 DCI.CombineTo(N, NewResults);
15203 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
15204
// Only fold one base update per node.
15205 break;
15206 }
15207 return SDValue();
15208}
15209
// NOTE(review): the signature lines (15210-15211, presumably
// `static SDValue PerformVLDCombine(SDNode *N,
//  TargetLowering::DAGCombinerInfo &DCI)`) are missing from this extract.
// Defers to CombineBaseUpdate once legalization has run.
15212 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
15213 return SDValue();
15214
15215 return CombineBaseUpdate(N, DCI);
15216}
15217
// NOTE(review): the signature lines (15218-15219) are missing from this
// extract; presumably `static SDValue PerformMVEVLDCombine(SDNode *N,
// TargetLowering::DAGCombinerInfo &DCI)`. Folds a post-increment ADD of the
// address into MVE vld2q/vld4q/vst2q/vst4q intrinsics, mirroring
// CombineBaseUpdate for the MVE structured load/store intrinsics.
15220 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
15221 return SDValue();
15222
15223 SelectionDAG &DAG = DCI.DAG;
15224 SDValue Addr = N->getOperand(2);
15226 SDLoc dl(N);
15227
15228 // For the stores, where there are multiple intrinsics we only actually want
15229 // to post-inc the last of the them.
15230 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
15231 if (IntNo == Intrinsic::arm_mve_vst2q &&
15232 cast<ConstantSDNode>(N->getOperand(5))->getZExtValue() != 1)
15233 return SDValue();
15234 if (IntNo == Intrinsic::arm_mve_vst4q &&
15235 cast<ConstantSDNode>(N->getOperand(7))->getZExtValue() != 3)
15236 return SDValue();
15237
15238 // Search for a use of the address operand that is an increment.
15239 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
15240 UE = Addr.getNode()->use_end();
15241 UI != UE; ++UI) {
15242 SDNode *User = *UI;
15243 if (User->getOpcode() != ISD::ADD ||
15244 UI.getUse().getResNo() != Addr.getResNo())
15245 continue;
15246
15247 // Check that the add is independent of the load/store. Otherwise, folding
15248 // it would create a cycle. We can avoid searching through Addr as it's a
15249 // predecessor to both.
// NOTE(review): the Visited/Worklist declarations (15250-15251) are elided
// in this extract.
15252 Visited.insert(Addr.getNode());
15253 Worklist.push_back(N);
15254 Worklist.push_back(User);
15255 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
15256 SDNode::hasPredecessorHelper(User, Visited, Worklist))
15257 continue;
15258
15259 // Find the new opcode for the updating load/store.
// NOTE(review): the NewOpc assignments inside each case (lines 15267, 15271,
// 15275, 15280 — presumably ARMISD::VLD2_UPD etc.) are elided here.
15260 bool isLoadOp = true;
15261 unsigned NewOpc = 0;
15262 unsigned NumVecs = 0;
15263 switch (IntNo) {
15264 default:
15265 llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
15266 case Intrinsic::arm_mve_vld2q:
15268 NumVecs = 2;
15269 break;
15270 case Intrinsic::arm_mve_vld4q:
15272 NumVecs = 4;
15273 break;
15274 case Intrinsic::arm_mve_vst2q:
15276 NumVecs = 2;
15277 isLoadOp = false;
15278 break;
15279 case Intrinsic::arm_mve_vst4q:
15281 NumVecs = 4;
15282 isLoadOp = false;
15283 break;
15284 }
15285
15286 // Find the size of memory referenced by the load/store.
15287 EVT VecTy;
15288 if (isLoadOp) {
15289 VecTy = N->getValueType(0);
15290 } else {
15291 VecTy = N->getOperand(3).getValueType();
15292 }
15293
15294 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
15295
15296 // If the increment is a constant, it must match the memory ref size.
15297 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
15299 if (!CInc || CInc->getZExtValue() != NumBytes)
15300 continue;
15301
15302 // Create the new updating load/store node.
15303 // First, create an SDVTList for the new updating node's results.
// Results: NumResultVecs vectors, then i32 (written-back pointer), then chain.
15304 EVT Tys[6];
15305 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
15306 unsigned n;
15307 for (n = 0; n < NumResultVecs; ++n)
15308 Tys[n] = VecTy;
15309 Tys[n++] = MVT::i32;
15310 Tys[n] = MVT::Other;
15312
15313 // Then, gather the new node's operands.
15315 Ops.push_back(N->getOperand(0)); // incoming chain
15316 Ops.push_back(N->getOperand(2)); // ptr
15317 Ops.push_back(Inc);
15318
15319 for (unsigned i = 3; i < N->getNumOperands(); ++i)
15320 Ops.push_back(N->getOperand(i));
15321
15322 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
15323 MemN->getMemOperand());
15324
15325 // Update the uses.
15327 for (unsigned i = 0; i < NumResultVecs; ++i)
15328 NewResults.push_back(SDValue(UpdN.getNode(), i));
15329
// Redirect the ADD (User) to the written-back pointer result.
15330 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
15331 DCI.CombineTo(N, NewResults);
15332 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
15333
15334 break;
15335 }
15336
15337 return SDValue();
15338}
15339
15340/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
15341/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
15342/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
15343/// return true.
// NOTE(review): the signature line (15344) is missing from this extract;
// presumably `static bool CombineVLDDUP(SDNode *N,
// TargetLowering::DAGCombinerInfo &DCI) {`. Also elided are the NewOpc
// assignments (15360/15363/15366) and several declaration lines near 15392-15395.
15345 SelectionDAG &DAG = DCI.DAG;
15346 EVT VT = N->getValueType(0);
15347 // vldN-dup instructions only support 64-bit vectors for N > 1.
15348 if (!VT.is64BitVector())
15349 return false;
15350
15351 // Check if the VDUPLANE operand is a vldN-dup intrinsic.
15352 SDNode *VLD = N->getOperand(0).getNode();
15353 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
15354 return false;
15355 unsigned NumVecs = 0;
15356 unsigned NewOpc = 0;
15357 unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
15358 if (IntNo == Intrinsic::arm_neon_vld2lane) {
15359 NumVecs = 2;
15361 } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
15362 NumVecs = 3;
15364 } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
15365 NumVecs = 4;
15367 } else {
15368 return false;
15369 }
15370
15371 // First check that all the vldN-lane uses are VDUPLANEs and that the lane
15372 // numbers match the load.
15373 unsigned VLDLaneNo =
15374 cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
15375 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
15376 UI != UE; ++UI) {
15377 // Ignore uses of the chain result.
15378 if (UI.getUse().getResNo() == NumVecs)
15379 continue;
15380 SDNode *User = *UI;
15381 if (User->getOpcode() != ARMISD::VDUPLANE ||
15382 VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
15383 return false;
15384 }
15385
15386 // Create the vldN-dup node.
// Result list is NumVecs vectors followed by the chain; operands are the
// incoming chain and the address.
15387 EVT Tys[5];
15388 unsigned n;
15389 for (n = 0; n < NumVecs; ++n)
15390 Tys[n] = VT;
15391 Tys[n] = MVT::Other;
15393 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
15396 Ops, VLDMemInt->getMemoryVT(),
15397 VLDMemInt->getMemOperand());
15398
15399 // Update the uses.
// Redirect every VDUPLANE user to the corresponding vldN-dup result.
15400 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
15401 UI != UE; ++UI) {
15402 unsigned ResNo = UI.getUse().getResNo();
15403 // Ignore uses of the chain result.
15404 if (ResNo == NumVecs)
15405 continue;
15406 SDNode *User = *UI;
15407 DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
15408 }
15409
15410 // Now the vldN-lane intrinsic is dead except for its chain result.
15411 // Update uses of the chain.
15412 std::vector<SDValue> VLDDupResults;
15413 for (unsigned n = 0; n < NumVecs; ++n)
15414 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
15415 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
15416 DCI.CombineTo(VLD, VLDDupResults);
15417
15418 return true;
15419}
15420
15421/// PerformVDUPLANECombine - Target-specific dag combine xforms for
15422/// ARMISD::VDUPLANE.
// NOTE(review): the first signature lines (15423-15424) are missing from this
// extract, as are the ExtractVT declaration (15431) and the `return SDValue();`
// after the legality check (15434). Confirm against the original source.
15425 const ARMSubtarget *Subtarget) {
15426 SDValue Op = N->getOperand(0);
15427 EVT VT = N->getValueType(0);
15428
15429 // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
15430 if (Subtarget->hasMVEIntegerOps()) {
15432 // We need to ensure we are creating a legal type.
15433 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
15435 SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
15436 N->getOperand(0), N->getOperand(1));
15437 return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
15438 }
15439
15440 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
15441 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
15442 if (CombineVLDDUP(N, DCI))
15443 return SDValue(N, 0);
15444
15445 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
15446 // redundant. Ignore bit_converts for now; element sizes are checked below.
15447 while (Op.getOpcode() == ISD::BITCAST)
15448 Op = Op.getOperand(0);
15449 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
15450 return SDValue();
15451
15452 // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
15453 unsigned EltSize = Op.getScalarValueSizeInBits();
15454 // The canonical VMOV for a zero vector uses a 32-bit element size.
15455 unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
15456 unsigned EltBits;
// decodeVMOVModImm returning 0 means the encoded value is 0; treat it as the
// 8-bit splat for the size comparison below.
15457 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
15458 EltSize = 8;
15459 if (EltSize > VT.getScalarSizeInBits())
15460 return SDValue();
15461
// The splat already covers every lane; just reinterpret it as VT.
15462 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
15463}
15464
15465/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
// NOTE(review): the first signature line (15466) is missing from this extract;
// presumably `static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG,`.
// Also elided: the getMemIntrinsicNode call line (15495) in the VLD1DUP path.
15467 const ARMSubtarget *Subtarget) {
15468 SDValue Op = N->getOperand(0);
15469 SDLoc dl(N);
15470
15471 if (Subtarget->hasMVEIntegerOps()) {
15472 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
15473 // need to come from a GPR.
15474 if (Op.getValueType() == MVT::f32)
15475 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
15476 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
15477 else if (Op.getValueType() == MVT::f16)
15478 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
15479 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
15480 }
15481
15482 if (!Subtarget->hasNEON())
15483 return SDValue();
15484
15485 // Match VDUP(LOAD) -> VLD1DUP.
15486 // We match this pattern here rather than waiting for isel because the
15487 // transform is only legal for unindexed loads.
15488 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
15489 if (LD && Op.hasOneUse() && LD->isUnindexed() &&
15490 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
// Operands: chain, address, alignment (as an i32 constant).
15491 SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
15492 DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32)};
15493 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
15494 SDValue VLDDup =
15496 LD->getMemoryVT(), LD->getMemOperand());
// Re-route the old load's chain users to the new node's chain result.
15497 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
15498 return VLDDup;
15499 }
15500
15501 return SDValue();
15502}
15503
// NOTE(review): the signature lines (15504-15505, presumably
// `static SDValue PerformLOADCombine(SDNode *N,
//  TargetLowering::DAGCombinerInfo &DCI)`) are missing from this extract.
15506 EVT VT = N->getValueType(0);
15507
15508 // If this is a legal vector load, try to combine it into a VLD1_UPD.
15509 if (ISD::isNormalLoad(N) && VT.isVector() &&
15510 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
15511 return CombineBaseUpdate(N, DCI);
15512
15513 return SDValue();
15514}
15515
15516// Optimize trunc store (of multiple scalars) to shuffle and store. First,
15517// pack all of the elements in one place. Next, store to memory in fewer
15518// chunks.
// NOTE(review): the signature line (15519) is missing from this extract;
// presumably `static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,`.
// Further elided lines: the pow-of-two checks (15533), SizeRatio use (15542),
// WideVecVT element count (15546), WideVec/ShuffleVec decls (15550-15551),
// the getVectorShuffle call (15560), StoreType decl (15566), getVectorVT
// (15577), ShuffWide/Chains decls (15580-15581), and the extract_subvector
// lines (15589-15590).
15520 SelectionDAG &DAG) {
15521 SDValue StVal = St->getValue();
15522 EVT VT = StVal.getValueType();
15523 if (!St->isTruncatingStore() || !VT.isVector())
15524 return SDValue();
15525 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15526 EVT StVT = St->getMemoryVT();
15527 unsigned NumElems = VT.getVectorNumElements();
15528 assert(StVT != VT && "Cannot truncate to the same type");
15529 unsigned FromEltSz = VT.getScalarSizeInBits();
15530 unsigned ToEltSz = StVT.getScalarSizeInBits();
15531
15532 // From, To sizes and ElemCount must be pow of two
15534 return SDValue();
15535
15536 // We are going to use the original vector elt for storing.
15537 // Accumulated smaller vector elements must be a multiple of the store size.
15538 if (0 != (NumElems * FromEltSz) % ToEltSz)
15539 return SDValue();
15540
15541 unsigned SizeRatio = FromEltSz / ToEltSz;
15543
15544 // Create a type on which we perform the shuffle.
15545 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
15547 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
15548
15549 SDLoc DL(St);
// Build a mask that gathers the (narrow) significant sub-elements of each
// wide element to the bottom of the register. On big-endian the significant
// sub-element is the last of each group, hence (i + 1) * SizeRatio - 1.
15552 for (unsigned i = 0; i < NumElems; ++i)
15553 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
15554 : i * SizeRatio;
15555
15556 // Can't shuffle using an illegal type.
15557 if (!TLI.isTypeLegal(WideVecVT))
15558 return SDValue();
15559
15561 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
15562 // At this point all of the data is stored at the bottom of the
15563 // register. We now need to save it to mem.
15564
15565 // Find the largest store unit
15567 for (MVT Tp : MVT::integer_valuetypes()) {
15568 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
15569 StoreType = Tp;
15570 }
15571 // Didn't find a legal store type.
15572 if (!TLI.isTypeLegal(StoreType))
15573 return SDValue();
15574
15575 // Bitcast the original vector into a vector of store-size units
15576 EVT StoreVecVT =
15578 VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
15579 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
15582 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
15583 TLI.getPointerTy(DAG.getDataLayout()));
15584 SDValue BasePtr = St->getBasePtr();
15585
15586 // Perform one or more big stores into memory.
// Each iteration stores one StoreType-sized chunk and advances the pointer.
15587 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
15588 for (unsigned I = 0; I < E; I++) {
15591 SDValue Ch =
15592 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
15593 St->getAlignment(), St->getMemOperand()->getFlags());
15594 BasePtr =
15595 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
15596 Chains.push_back(Ch);
15597 }
// Tie all the partial stores together into one chain.
15598 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
15599}
15600
15601// Try taking a single vector store from an fpround (which would otherwise turn
15602// into an expensive buildvector) and splitting it into a series of narrowing
15603// stores.
// NOTE(review): the signature line (15604) is missing from this extract;
// presumably `static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,`.
// Also elided: the NewToVT declaration (15668), the Stores SmallVector (15671),
// the pointer-offset node (15675), the extract_subvector (15678), and the
// FP_ROUND / assert lines around 15681-15684.
15605 SelectionDAG &DAG) {
15606 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
15607 return SDValue();
15608 SDValue Trunc = St->getValue();
15609 if (Trunc->getOpcode() != ISD::FP_ROUND)
15610 return SDValue();
15611 EVT FromVT = Trunc->getOperand(0).getValueType();
15612 EVT ToVT = Trunc.getValueType();
15613 if (!ToVT.isVector())
15614 return SDValue();
15615 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
15616 EVT ToEltVT = ToVT.getVectorElementType();
15617 EVT FromEltVT = FromVT.getVectorElementType();
15618
// Only the f32 -> f16 narrowing case is handled here.
15619 if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
15620 return SDValue();
15621
15622 unsigned NumElements = 4;
15623 if (FromVT.getVectorNumElements() % NumElements != 0)
15624 return SDValue();
15625
15626 // Test if the Trunc will be convertable to a VMOVN with a shuffle, and if so
15627 // use the VMOVN over splitting the store. We are looking for patterns of:
15628 // !rev: 0 N 1 N+1 2 N+2 ...
15629 // rev: N 0 N+1 1 N+2 2 ...
15630 // The shuffle may either be a single source (in which case N = NumElts/2) or
15631 // two inputs extended with concat to the same size (in which case N =
15632 // NumElts).
15633 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
15634 ArrayRef<int> M = SVN->getMask();
15635 unsigned NumElts = ToVT.getVectorNumElements();
15636 if (SVN->getOperand(1).isUndef())
15637 NumElts /= 2;
15638
15639 unsigned Off0 = Rev ? NumElts : 0;
15640 unsigned Off1 = Rev ? 0 : NumElts;
15641
// Even lanes must come from Off0 + I/2, odd lanes from Off1 + I/2;
// negative (undef) mask entries are don't-care.
15642 for (unsigned I = 0; I < NumElts; I += 2) {
15643 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
15644 return false;
15645 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
15646 return false;
15647 }
15648
15649 return true;
15650 };
15651
// Bail out if a VMOVN pattern applies — it is the better lowering.
15652 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
15653 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
15654 return SDValue();
15655
15656 LLVMContext &C = *DAG.getContext();
15657 SDLoc DL(St);
15658 // Details about the old store
15659 SDValue Ch = St->getChain();
15660 SDValue BasePtr = St->getBasePtr();
15661 Align Alignment = St->getOriginalAlign();
15662 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
15663 AAMDNodes AAInfo = St->getAAInfo();
15664
15665 // We split the store into slices of NumElements. fp16 trunc stores are vcvt
15666 // and then stored as truncating integer stores.
15667 EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
15669 C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
15670
15672 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
15673 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
15674 SDValue NewPtr =
15676
15677 SDValue Extract =
15679 DAG.getConstant(i * NumElements, DL, MVT::i32));
15680
15683 Extract, DAG.getConstant(0, DL, MVT::i32));
15685
15686 SDValue Store = DAG.getTruncStore(
15687 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
15688 NewToVT, Alignment.value(), MMOFlags, AAInfo);
15689 Stores.push_back(Store);
15690 }
// Tie the per-slice stores together into one chain.
15691 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
15692}
15693
15694// Try taking a single vector store from an MVETRUNC (which would otherwise turn
15695// into an expensive buildvector) and splitting it into a series of narrowing
15696// stores.
// NOTE(review): the signature line (15697) is missing from this extract;
// presumably `static SDValue PerformSplittingMVETruncToNarrowingStores(
// StoreSDNode *St,`. Also elided: the Stores SmallVector (15719) and the
// pointer-offset node (15724).
15698 SelectionDAG &DAG) {
15699 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
15700 return SDValue();
15701 SDValue Trunc = St->getValue();
15702 if (Trunc->getOpcode() != ARMISD::MVETRUNC)
15703 return SDValue();
15704 EVT FromVT = Trunc->getOperand(0).getValueType();
15705 EVT ToVT = Trunc.getValueType();
15706
15707 LLVMContext &C = *DAG.getContext();
15708 SDLoc DL(St);
15709 // Details about the old store
15710 SDValue Ch = St->getChain();
15711 SDValue BasePtr = St->getBasePtr();
15712 Align Alignment = St->getOriginalAlign();
15713 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
15714 AAMDNodes AAInfo = St->getAAInfo();
15715
// Each MVETRUNC operand becomes one truncating store of this narrow type.
15716 EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
15717 FromVT.getVectorNumElements());
15718
15720 for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
15721 unsigned NewOffset =
15722 i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
15723 SDValue NewPtr =
15725
15726 SDValue Extract = Trunc.getOperand(i);
15727 SDValue Store = DAG.getTruncStore(
15728 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
15729 NewToVT, Alignment.value(), MMOFlags, AAInfo);
15730 Stores.push_back(Store);
15731 }
// Tie the per-operand stores together into one chain.
15732 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
15733}
15734
15735// Given a floating point store from an extracted vector, with an integer
15736// VGETLANE that already exists, store the existing VGETLANEu directly. This can
15737// help reduce fp register pressure, doesn't require the fp extract and allows
15738// use of more integer post-inc stores not available with vstr.
// NOTE(review): the signature line (15739) is missing from this extract;
// presumably `static SDValue PerformExtractFpToIntStores(StoreSDNode *St,
// SelectionDAG &DAG) {`. Also elided: the getNodeIfExists call (15750) that
// initializes GetLane, and the NewToVT declaration (15763).
15740 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
15741 return SDValue();
15742 SDValue Extract = St->getValue();
15743 EVT VT = Extract.getValueType();
15744 // For now only uses f16. This may be useful for f32 too, but that will
15745 // be bitcast(extract), not the VGETLANEu we currently check here.
15746 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15747 return SDValue();
15748
// Look for an already-existing integer VGETLANE of the same vector/lane;
// only reuse it, never create a new one.
15749 SDNode *GetLane =
15751 {Extract.getOperand(0), Extract.getOperand(1)});
15752 if (!GetLane)
15753 return SDValue();
15754
15755 LLVMContext &C = *DAG.getContext();
15756 SDLoc DL(St);
15757 // Create a new integer store to replace the existing floating point version.
15758 SDValue Ch = St->getChain();
15759 SDValue BasePtr = St->getBasePtr();
15760 Align Alignment = St->getOriginalAlign();
15761 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
15762 AAMDNodes AAInfo = St->getAAInfo();
// Truncating i32 lane value down to the 16-bit memory type.
15764 SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
15765 St->getPointerInfo(), NewToVT,
15766 Alignment.value(), MMOFlags, AAInfo);
15767
15768 return Store;
15769}
15770
15771/// PerformSTORECombine - Target-specific dag combine xforms for
15772/// ISD::STORE.
// NOTE(review): the first line of the signature (original lines 15773-15774)
// is missing from this extract; `N` is the STORE node, `DCI` the combiner
// info, and `St` (used below, cast on missing line 15776) the StoreSDNode.
15775 const ARMSubtarget *Subtarget) {
15777 if (St->isVolatile())
15778 return SDValue();
15779 SDValue StVal = St->getValue();
15780 EVT VT = StVal.getValueType();
15781
// NEON: try to turn a truncating vector store into narrower stores.
15782 if (Subtarget->hasNEON())
15783 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
15784 return Store;
15785
// MVE: several store combines; the calls themselves (original lines 15787,
// 15789, 15792) are missing from this extract — only the returns survive.
15786 if (Subtarget->hasMVEIntegerOps()) {
15788 return NewToken;
15790 return NewChain;
15791 if (SDValue NewToken =
15793 return NewToken;
15794 }
15795
15796 if (!ISD::isNormalStore(St))
15797 return SDValue();
15798
15799 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
15800 // ARM stores of arguments in the same cache line.
15801 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
15802 StVal.getNode()->hasOneUse()) {
15803 SelectionDAG &DAG = DCI.DAG;
15804 bool isBigEndian = DAG.getDataLayout().isBigEndian();
15805 SDLoc DL(St);
15806 SDValue BasePtr = St->getBasePtr();
// Store the low word first (operand order swapped on big-endian targets)...
15807 SDValue NewST1 = DAG.getStore(
15808 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
15809 BasePtr, St->getPointerInfo(), St->getOriginalAlign(),
15810 St->getMemOperand()->getFlags());
15811
// ...then the high word at BasePtr+4, chained after the first store.
15812 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
15813 DAG.getConstant(4, DL, MVT::i32));
15814 return DAG.getStore(NewST1.getValue(0), DL,
15815 StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
15816 OffsetPtr, St->getPointerInfo().getWithOffset(4),
15817 St->getOriginalAlign(),
15818 St->getMemOperand()->getFlags());
15819 }
15820
15821 if (StVal.getValueType() == MVT::i64 &&
15822 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
15823
15824 // Bitcast an i64 store extracted from a vector to f64.
15825 // Otherwise, the i64 value will be legalized to a pair of i32 values.
15826 SelectionDAG &DAG = DCI.DAG;
15827 SDLoc dl(StVal);
15828 SDValue IntVec = StVal.getOperand(0);
15829 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
15830 IntVec.getValueType().getVectorNumElements());
15831 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
// NOTE(review): the EXTRACT_VECTOR_ELT creating `ExtElt` (line 15832) and the
// bitcast creating `V` (line 15835) are missing from this extract.
15833 Vec, StVal.getOperand(1));
15834 dl = SDLoc(N);
15836 // Make the DAGCombiner fold the bitcasts.
15837 DCI.AddToWorklist(Vec.getNode());
15838 DCI.AddToWorklist(ExtElt.getNode());
15839 DCI.AddToWorklist(V.getNode());
15840 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
15841 St->getPointerInfo(), St->getAlignment(),
15842 St->getMemOperand()->getFlags(), St->getAAInfo());
15843 }
15844
15845 // If this is a legal vector store, try to combine it into a VST1_UPD.
15846 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
15847 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
15848 return CombineBaseUpdate(N, DCI);
15849
15850 return SDValue();
15851}
15852
15853/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
15854/// can replace combinations of VMUL and VCVT (floating-point to integer)
15855/// when the VMUL has a constant operand that is a power of 2.
15856///
15857/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
15858/// vmul.f32 d16, d17, d16
15859/// vcvt.s32.f32 d16, d16
15860/// becomes:
15861/// vcvt.s32.f32 d16, d16, #3
// NOTE(review): the first signature line (original line 15862) is missing
// from this extract; `N` is the FP_TO_SINT/FP_TO_UINT node.
15863 const ARMSubtarget *Subtarget) {
15864 if (!Subtarget->hasNEON())
15865 return SDValue();
15866
// The conversion's input must be a simple vector FMUL.
15867 SDValue Op = N->getOperand(0);
15868 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
15869 Op.getOpcode() != ISD::FMUL)
15870 return SDValue();
15871
// NOTE(review): the BUILD_VECTOR check on ConstVec (line 15873) is missing
// from this extract.
15872 SDValue ConstVec = Op->getOperand(1);
15874 return SDValue();
15875
15876 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
15877 uint32_t FloatBits = FloatTy.getSizeInBits();
15878 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
15879 uint32_t IntBits = IntTy.getSizeInBits();
15880 unsigned NumLanes = Op.getValueType().getVectorNumElements();
15881 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
15882 // These instructions only exist converting from f32 to i32. We can handle
15883 // smaller integers by generating an extra truncate, but larger ones would
15884 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
15885 // these instructions only support v2i32/v4i32 types.
15886 return SDValue();
15887 }
15888
// The multiplier must be a power-of-2 splat; C is log2 of it. A fixed-point
// VCVT supports #1..#32, so reject 0, >32 and the not-a-pow2 sentinel (-1).
15891 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
15892 if (C == -1 || C == 0 || C > 32)
15893 return SDValue();
15894
15895 SDLoc dl(N);
15896 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
15897 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
15898 Intrinsic::arm_neon_vcvtfp2fxu;
15899 SDValue FixConv = DAG.getNode(
15901 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
15902 DAG.getConstant(C, dl, MVT::i32));
15903
// Narrow the i32 result back down if the original destination was smaller.
15904 if (IntBits < FloatBits)
15905 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
15906
15907 return FixConv;
15908}
15909
15910/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
15911/// can replace combinations of VCVT (integer to floating-point) and VDIV
15912/// when the VDIV has a constant operand that is a power of 2.
15913///
15914/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
15915/// vcvt.f32.s32 d16, d16
15916/// vdiv.f32 d16, d17, d16
15917/// becomes:
15918/// vcvt.f32.s32 d16, d16, #3
// NOTE(review): the first signature line (original line 15919) is missing
// from this extract; `N` is the FDIV node being combined.
15920 const ARMSubtarget *Subtarget) {
15921 if (!Subtarget->hasNEON())
15922 return SDValue();
15923
// The dividend must be an int-to-fp conversion (checked on the missing
// line 15927, presumably SINT_TO_FP/UINT_TO_FP — verify against upstream).
15924 SDValue Op = N->getOperand(0);
15925 unsigned OpOpcode = Op.getNode()->getOpcode();
15926 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
15928 return SDValue();
15929
// NOTE(review): the BUILD_VECTOR check on ConstVec (line 15931) is missing.
15930 SDValue ConstVec = N->getOperand(1);
15932 return SDValue();
15933
15934 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
15935 uint32_t FloatBits = FloatTy.getSizeInBits();
15936 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
15937 uint32_t IntBits = IntTy.getSizeInBits();
15938 unsigned NumLanes = Op.getValueType().getVectorNumElements();
15939 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
15940 // These instructions only exist converting from i32 to f32. We can handle
15941 // smaller integers by generating an extra extend, but larger ones would
15942 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
15943 // these instructions only support v2i32/v4i32 types.
15944 return SDValue();
15945 }
15946
// The divisor must be a power-of-2 splat; C = log2(divisor), valid 1..32.
15949 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
15950 if (C == -1 || C == 0 || C > 32)
15951 return SDValue();
15952
15953 SDLoc dl(N);
15954 bool isSigned = OpOpcode == ISD::SINT_TO_FP;
15955 SDValue ConvInput = Op.getOperand(0);
// Widen sub-32-bit integer inputs to i32 lanes first (extend node on the
// missing line 15957).
15956 if (IntBits < FloatBits)
15958 dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
15959 ConvInput);
15960
15961 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
15962 Intrinsic::arm_neon_vcvtfxu2fp;
15963 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
15964 Op.getValueType(),
15966 ConvInput, DAG.getConstant(C, dl, MVT::i32));
15967}
15968
// Combine a VECREDUCE_ADD (possibly of extended and/or multiplied and/or
// predicated inputs) into a single MVE VADDV/VADDLV/VMLAV/VMLALV node.
// NOTE(review): the first signature line (original line 15969,
// PerformVECREDUCE_ADDCombine) is missing from this extract.
15970 const ARMSubtarget *ST) {
15971 if (!ST->hasMVEIntegerOps())
15972 return SDValue();
15973
15974 assert(N->getOpcode() == ISD::VECREDUCE_ADD);
15975 EVT ResVT = N->getValueType(0);
15976 SDValue N0 = N->getOperand(0);
15977 SDLoc dl(N);
15978
15979 // We are looking for something that will have illegal types if left alone,
15980 // but that we can convert to a single instruction under MVE. For example
15981 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
15982 // or
15983 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
15984
15985 // The legal cases are:
15986 // VADDV u/s 8/16/32
15987 // VMLAV u/s 8/16/32
15988 // VADDLV u/s 32
15989 // VMLALV u/s 16/32
15990
15991 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
15992 // extend it and use v4i32 instead.
// Predicate lambda (its header, line 15993, is missing from this extract):
// true iff A's type matches one of ExtTypes in lane count and fits within it.
15994 EVT AVT = A.getValueType();
15995 return any_of(ExtTypes, [&](MVT Ty) {
15996 return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
15997 AVT.bitsLE(Ty);
15998 });
15999 };
// Widen a sub-128-bit vector up to the equivalent 128-bit type so the MVE
// reduction instructions can consume it.
16000 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
16001 EVT AVT = A.getValueType();
16002 if (!AVT.is128BitVector())
16003 A = DAG.getNode(ExtendCode, dl,
16004 AVT.changeVectorElementType(MVT::getIntegerVT(
16005 128 / AVT.getVectorMinNumElements())),
16006 A);
16007 return A;
16008 };
// Match vecreduce_add(ext(A)) for a plain VADDV.
16009 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
16010 if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
16011 return SDValue();
16012 SDValue A = N0->getOperand(0);
16014 return ExtendIfNeeded(A, ExtendCode);
16015 return SDValue();
16016 };
// Match vecreduce_add(vselect(Mask, ext(A), 0)) for a predicated VADDV.
// (The zero-operand check, line 16020, is missing from this extract.)
16017 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
16019 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
16021 return SDValue();
16022 Mask = N0->getOperand(0);
16023 SDValue Ext = N0->getOperand(1);
16024 if (Ext->getOpcode() != ExtendCode)
16025 return SDValue();
16026 SDValue A = Ext->getOperand(0);
16028 return ExtendIfNeeded(A, ExtendCode);
16029 return SDValue();
16030 };
16031 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
16032 SDValue &A, SDValue &B) {
16033 // For a vmla we are trying to match a larger pattern:
16034 // ExtA = sext/zext A
16035 // ExtB = sext/zext B
16036 // Mul = mul ExtA, ExtB
16037 // vecreduce.add Mul
16038 // There might also be an extra extend between the mul and the addreduce, so
16039 // long as the bitwidth is high enough to make them equivalent (for example
16040 // original v8i16 might be mul at v8i32 and the reduce happens at v8i64).
16041 if (ResVT != RetTy)
16042 return false;
16043 SDValue Mul = N0;
16044 if (Mul->getOpcode() == ExtendCode &&
16045 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
16046 ResVT.getScalarSizeInBits())
16047 Mul = Mul->getOperand(0);
16048 if (Mul->getOpcode() != ISD::MUL)
16049 return false;
16050 SDValue ExtA = Mul->getOperand(0);
16051 SDValue ExtB = Mul->getOperand(1);
16052 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
16053 return false;
16054 A = ExtA->getOperand(0);
16055 B = ExtB->getOperand(0);
// NOTE(review): the type check and ExtendIfNeeded calls on A/B (lines
// 16056-16058) are missing from this extract.
16059 return true;
16060 }
16061 return false;
16062 };
16063 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
16064 SDValue &A, SDValue &B, SDValue &Mask) {
16065 // Same as the pattern above with a select for the zero predicated lanes
16066 // ExtA = sext/zext A
16067 // ExtB = sext/zext B
16068 // Mul = mul ExtA, ExtB
16069 // N0 = select Mask, Mul, 0
16070 // vecreduce.add N0
16071 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
16073 return false;
16074 Mask = N0->getOperand(0);
16075 SDValue Mul = N0->getOperand(1);
16076 if (Mul->getOpcode() == ExtendCode &&
16077 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
16078 ResVT.getScalarSizeInBits())
16079 Mul = Mul->getOperand(0);
16080 if (Mul->getOpcode() != ISD::MUL)
16081 return false;
16082 SDValue ExtA = Mul->getOperand(0);
16083 SDValue ExtB = Mul->getOperand(1);
16084 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
16085 return false;
16086 A = ExtA->getOperand(0);
16087 B = ExtB->getOperand(0);
// NOTE(review): as above, the type check / ExtendIfNeeded lines (16088-16090)
// are missing from this extract.
16091 return true;
16092 }
16093 return false;
16094 };
16095 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
16096 // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64
16097 // reductions. The operands are extended with MVEEXT, but as they are
16098 // reductions the lane orders do not matter. MVEEXT may be combined with
16099 // loads to produce two extending loads, or else they will be expanded to
16100 // VREV/VMOVL.
16101 EVT VT = Ops[0].getValueType();
16102 if (VT == MVT::v16i8) {
16103 assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
16104 "Unexpected illegal long reduction opcode");
16105 bool IsUnsigned = Opcode == ARMISD::VMLALVu;
16106
16107 SDValue Ext0 =
16108 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
16109 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
16110 SDValue Ext1 =
16111 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
16112 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);
16113
// First VMLALV of the low halves, then an accumulating VMLALVA of the high
// halves so both partial products feed one i64 (as a BUILD_PAIR of i32s).
16114 SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
16115 Ext0, Ext1);
16116 SDValue MLA1 =
16117 DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
16118 DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
16119 Ext0.getValue(1), Ext1.getValue(1));
16120 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
16121 }
16122 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
16123 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
16124 SDValue(Node.getNode(), 1));
16125 };
16126
// Try all the legal (predicated and plain) VADDV/VADDLV/VMLAV/VMLALV
// combinations in turn. NOTE(review): most of the IsVADDV/IsVMLAV condition
// lines (16127, 16129, 16131-16135, 16138, 16143-16151, 16154, 16159-16188,
// 16191) are missing from this extract; only the returns survive.
16128 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
16130 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
16136 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
16137 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
16139 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
16140 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
16141
16142 SDValue Mask;
16144 return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
16146 return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
16148 return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
16150 return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
16152 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
16153 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
16155 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
16156 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
16157
16158 SDValue A, B;
16160 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
16162 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
16164 A, B))
16165 return Create64bitNode(ARMISD::VMLALVs, {A, B});
16167 A, B))
16168 return Create64bitNode(ARMISD::VMLALVu, {A, B});
16170 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
16171 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
16173 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
16174 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
16175
16177 Mask))
16178 return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
16180 Mask))
16181 return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
16183 Mask))
16184 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
16186 Mask))
16187 return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
16189 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
16190 DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
16192 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
16193 DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
16194
16195 // Some complications. We can get a case where the two inputs of the mul are
16196 // the same, then the output sext will have been helpfully converted to a
16197 // zext. Turn it back.
16198 SDValue Op = N0;
16199 if (Op->getOpcode() == ISD::VSELECT)
16200 Op = Op->getOperand(1);
16201 if (Op->getOpcode() == ISD::ZERO_EXTEND &&
16202 Op->getOperand(0)->getOpcode() == ISD::MUL) {
16203 SDValue Mul = Op->getOperand(0);
16204 if (Mul->getOperand(0) == Mul->getOperand(1) &&
16205 Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
// Rebuild the sext form (re-wrapping the vselect if there was one) and
// re-run the vecreduce combine on it.
16206 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
16207 if (Op != N0)
16208 Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
16209 N0->getOperand(0), Ext, N0->getOperand(2));
16210 return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
16211 }
16212 }
16213
16214 return SDValue();
16215}
16216
// Combine on an ARMISD::VMOVN node (narrow-and-insert into top/bottom lanes).
// NOTE(review): the signature lines (original 16217-16218,
// PerformVMOVNCombine) are missing from this extract.
16219 SDValue Op0 = N->getOperand(0);
16220 SDValue Op1 = N->getOperand(1);
16221 unsigned IsTop = N->getConstantOperandVal(2);
16222
16223 // VMOVNT a undef -> a
16224 // VMOVNB a undef -> a
16225 // VMOVNB undef a -> a
16226 if (Op1->isUndef())
16227 return Op0;
16228 if (Op0->isUndef() && !IsTop)
16229 return Op1;
16230
16231 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
16232 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
16233 if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
16234 Op1->getOpcode() == ARMISD::VQMOVNu) &&
16235 Op1->getConstantOperandVal(2) == 0)
16236 return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
16237 Op0, Op1->getOperand(1), N->getOperand(2));
16238
16239 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
16240 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
16241 // into the top or bottom lanes.
// NOTE(review): the demanded-elements APInt setup (lines 16243-16246, 16248,
// 16250, 16253) is missing from this extract; the calls below simplify Op0
// and Op1 against those masks and return N itself if anything changed.
16242 unsigned NumElts = N->getValueType(0).getVectorNumElements();
16247
16249 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
16251 KnownZero, DCI))
16252 return SDValue(N, 0);
16254 KnownZero, DCI))
16255 return SDValue(N, 0);
16256
16257 return SDValue();
16258}
16259
// Combine on an ARMISD::VQMOVN node: only half the lanes of Op0 are demanded
// (top or bottom, per operand 2), so simplify Op0 against that mask.
// NOTE(review): the signature lines (original 16260-16261,
// PerformVQMOVNCombine) and parts of the APInt mask construction (16266-16267,
// 16270, 16272) are missing from this extract.
16262 SDValue Op0 = N->getOperand(0);
16263 unsigned IsTop = N->getConstantOperandVal(2);
16264
16265 unsigned NumElts = N->getValueType(0).getVectorNumElements();
16268 : APInt::getHighBitsSet(2, 1));
16269
16271 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
16273 KnownZero, DCI))
16274 return SDValue(N, 0);
16275 return SDValue();
16276}
16277
// Combine on ARMISD::LSLL/LSRL (two-register long shifts).
// NOTE(review): the signature line (original 16278, PerformLongShiftCombine)
// is missing from this extract.
16279 SDLoc DL(N);
16280 SDValue Op0 = N->getOperand(0);
16281 SDValue Op1 = N->getOperand(1);
16282
16283 // Turn X << -C -> X >> C and viceversa. The negative shifts can come up from
16284 // uses of the intrinsics.
16285 if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
16286 int ShiftAmt = C->getSExtValue();
// Shift by zero is the identity: forward both result halves unchanged.
16287 if (ShiftAmt == 0) {
16288 SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
16289 DAG.ReplaceAllUsesWith(N, Merge.getNode());
16290 return SDValue();
16291 }
16292
// Negative amounts in [-32, -1]: flip LSLL<->LSRL and negate the amount.
16293 if (ShiftAmt >= -32 && ShiftAmt < 0) {
16294 unsigned NewOpcode =
16295 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
16296 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
16297 DAG.getConstant(-ShiftAmt, DL, MVT::i32));
16298 DAG.ReplaceAllUsesWith(N, NewShift.getNode());
16299 return NewShift;
16300 }
16301 }
16302
16303 return SDValue();
16304}
16305
16306/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
// NOTE(review): the first line of the signature (original line 16307, the
// ARMTargetLowering::PerformIntrinsicCombine(SDNode *N, ...) part) is missing
// from this extract.
16308 DAGCombinerInfo &DCI) const {
16309 SelectionDAG &DAG = DCI.DAG;
16310 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
16311 switch (IntNo) {
16312 default:
16313 // Don't do anything for most intrinsics.
16314 break;
16315
16316 // Vector shifts: check for immediate versions and lower them.
16317 // Note: This is done during DAG combining instead of DAG legalizing because
16318 // the build_vectors for 64-bit vector element shift counts are generally
16319 // not legal, and it is hard to see their values after they get legalized to
16320 // loads from a constant pool.
16321 case Intrinsic::arm_neon_vshifts:
16322 case Intrinsic::arm_neon_vshiftu:
16323 case Intrinsic::arm_neon_vrshifts:
16324 case Intrinsic::arm_neon_vrshiftu:
16325 case Intrinsic::arm_neon_vrshiftn:
16326 case Intrinsic::arm_neon_vqshifts:
16327 case Intrinsic::arm_neon_vqshiftu:
16328 case Intrinsic::arm_neon_vqshiftsu:
16329 case Intrinsic::arm_neon_vqshiftns:
16330 case Intrinsic::arm_neon_vqshiftnu:
16331 case Intrinsic::arm_neon_vqshiftnsu:
16332 case Intrinsic::arm_neon_vqrshiftns:
16333 case Intrinsic::arm_neon_vqrshiftnu:
16334 case Intrinsic::arm_neon_vqrshiftnsu: {
16335 EVT VT = N->getOperand(1).getValueType();
16336 int64_t Cnt;
16337 unsigned VShiftOpc = 0;
16338
// First switch: validate the shift amount is a legal immediate for the
// intrinsic's direction (left vs right, widening vs narrowing).
16339 switch (IntNo) {
16340 case Intrinsic::arm_neon_vshifts:
16341 case Intrinsic::arm_neon_vshiftu:
16342 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
// NOTE(review): the VShiftOpc assignment (line 16343) is missing from this
// extract; presumably ARMISD::VSHLIMM.
16344 break;
16345 }
16346 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
16347 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
// NOTE(review): the second half of this conditional (line 16348, presumably
// : ARMISD::VSHRuIMM) is missing from this extract.
16349 break;
16350 }
16351 return SDValue();
16352
16353 case Intrinsic::arm_neon_vrshifts:
16354 case Intrinsic::arm_neon_vrshiftu:
16355 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
16356 break;
16357 return SDValue();
16358
16359 case Intrinsic::arm_neon_vqshifts:
16360 case Intrinsic::arm_neon_vqshiftu:
16361 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
16362 break;
16363 return SDValue();
16364
16365 case Intrinsic::arm_neon_vqshiftsu:
16366 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
16367 break;
16368 llvm_unreachable("invalid shift count for vqshlu intrinsic");
16369
16370 case Intrinsic::arm_neon_vrshiftn:
16371 case Intrinsic::arm_neon_vqshiftns:
16372 case Intrinsic::arm_neon_vqshiftnu:
16373 case Intrinsic::arm_neon_vqshiftnsu:
16374 case Intrinsic::arm_neon_vqrshiftns:
16375 case Intrinsic::arm_neon_vqrshiftnu:
16376 case Intrinsic::arm_neon_vqrshiftnsu:
16377 // Narrowing shifts require an immediate right shift.
16378 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
16379 break;
16380 llvm_unreachable("invalid shift count for narrowing vector shift "
16381 "intrinsic");
16382
16383 default:
16384 llvm_unreachable("unhandled vector shift");
16385 }
16386
// Second switch: pick the target opcode. NOTE(review): every VShiftOpc
// assignment (lines 16393, 16396, 16399, 16402, 16405, 16408, 16411, 16414,
// 16417, 16420, 16423, 16426) is missing from this extract; only the case
// labels and breaks survive.
16387 switch (IntNo) {
16388 case Intrinsic::arm_neon_vshifts:
16389 case Intrinsic::arm_neon_vshiftu:
16390 // Opcode already set above.
16391 break;
16392 case Intrinsic::arm_neon_vrshifts:
16394 break;
16395 case Intrinsic::arm_neon_vrshiftu:
16397 break;
16398 case Intrinsic::arm_neon_vrshiftn:
16400 break;
16401 case Intrinsic::arm_neon_vqshifts:
16403 break;
16404 case Intrinsic::arm_neon_vqshiftu:
16406 break;
16407 case Intrinsic::arm_neon_vqshiftsu:
16409 break;
16410 case Intrinsic::arm_neon_vqshiftns:
16412 break;
16413 case Intrinsic::arm_neon_vqshiftnu:
16415 break;
16416 case Intrinsic::arm_neon_vqshiftnsu:
16418 break;
16419 case Intrinsic::arm_neon_vqrshiftns:
16421 break;
16422 case Intrinsic::arm_neon_vqrshiftnu:
16424 break;
16425 case Intrinsic::arm_neon_vqrshiftnsu:
16427 break;
16428 }
16429
16430 SDLoc dl(N);
16431 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
16432 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
16433 }
16434
16435 case Intrinsic::arm_neon_vshiftins: {
16436 EVT VT = N->getOperand(1).getValueType();
16437 int64_t Cnt;
16438 unsigned VShiftOpc = 0;
16439
// NOTE(review): the VShiftOpc assignments (lines 16441 and 16443, presumably
// ARMISD::VSLIIMM / ARMISD::VSRIIMM) are missing from this extract.
16440 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
16442 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
16444 else {
16445 llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
16446 }
16447
16448 SDLoc dl(N);
16449 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
16450 N->getOperand(1), N->getOperand(2),
16451 DAG.getConstant(Cnt, dl, MVT::i32));
16452 }
16453
16454 case Intrinsic::arm_neon_vqrshifts:
16455 case Intrinsic::arm_neon_vqrshiftu:
16456 // No immediate versions of these to check for.
16457 break;
16458
16459 case Intrinsic::arm_mve_vqdmlah:
16460 case Intrinsic::arm_mve_vqdmlash:
16461 case Intrinsic::arm_mve_vqrdmlah:
16462 case Intrinsic::arm_mve_vqrdmlash:
16463 case Intrinsic::arm_mve_vmla_n_predicated:
16464 case Intrinsic::arm_mve_vmlas_n_predicated:
16465 case Intrinsic::arm_mve_vqdmlah_predicated:
16466 case Intrinsic::arm_mve_vqdmlash_predicated:
16467 case Intrinsic::arm_mve_vqrdmlah_predicated:
16468 case Intrinsic::arm_mve_vqrdmlash_predicated: {
16469 // These intrinsics all take an i32 scalar operand which is narrowed to the
16470 // size of a single lane of the vector type they return. So we don't need
16471 // any bits of that operand above that point, which allows us to eliminate
16472 // uxth/sxth.
16473 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
// NOTE(review): the DemandedMask construction (line 16474) is missing from
// this extract.
16475 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
16476 return SDValue();
16477 break;
16478 }
16479
16480 case Intrinsic::arm_mve_minv:
16481 case Intrinsic::arm_mve_maxv:
16482 case Intrinsic::arm_mve_minav:
16483 case Intrinsic::arm_mve_maxav:
16484 case Intrinsic::arm_mve_minv_predicated:
16485 case Intrinsic::arm_mve_maxv_predicated:
16486 case Intrinsic::arm_mve_minav_predicated:
16487 case Intrinsic::arm_mve_maxav_predicated: {
16488 // These intrinsics all take an i32 scalar operand which is narrowed to the
16489 // size of a single lane of the vector type they take as the other input.
16490 unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
// NOTE(review): the DemandedMask construction (line 16491) is missing from
// this extract.
16492 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
16493 return SDValue();
16494 break;
16495 }
16496
16497 case Intrinsic::arm_mve_addv: {
16498 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
16499 // which allow PerformADDVecReduce to turn it into VADDLV when possible.
16500 bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
16501 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
16502 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
16503 }
16504
16505 case Intrinsic::arm_mve_addlv:
16506 case Intrinsic::arm_mve_addlv_predicated: {
16507 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
16508 // which recombines the two outputs into an i64
16509 bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
16510 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
// NOTE(review): the opcode selection continuation and Ops declaration
// (lines 16511-16512, 16514) are missing from this extract.
16513
16515 for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
16516 if (i != 2) // skip the unsigned flag
16517 Ops.push_back(N->getOperand(i));
16518
16519 SDLoc dl(N);
16520 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
16521 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
16522 val.getValue(1));
16523 }
16524 }
16525
16526 return SDValue();
16527}
16528
16529/// PerformShiftCombine - Checks for immediate versions of vector shifts and
16530/// lowers them. As with the vector shift intrinsics, this is done during DAG
16531/// combining instead of DAG legalizing because the build_vectors for 64-bit
16532/// vector element shift counts are generally not legal, and it is hard to see
16533/// their values after they get legalized to loads from a constant pool.
// NOTE(review): the first signature lines (original 16534-16535) are missing
// from this extract; `N` is a SHL/SRA/SRL node.
16536 const ARMSubtarget *ST) {
16537 SelectionDAG &DAG = DCI.DAG;
16538 EVT VT = N->getValueType(0);
16539 if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
16540 // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
16541 // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16.
16542 SDValue N1 = N->getOperand(1);
// NOTE(review): the dyn_cast<ConstantSDNode> guard on N1 (line 16543) is
// missing from this extract.
16544 SDValue N0 = N->getOperand(0);
16545 if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
16546 DAG.MaskedValueIsZero(N0.getOperand(0),
16547 APInt::getHighBitsSet(32, 16)))
16548 return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
16549 }
16550 }
16551
16552 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
16553 N->getOperand(0)->getOpcode() == ISD::AND &&
16554 N->getOperand(0)->hasOneUse()) {
16555 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16556 return SDValue();
16557 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
16558 // usually show up because instcombine prefers to canonicalize it to
16559 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
16560 // out of GEP lowering in some cases.
16561 SDValue N0 = N->getOperand(0);
// NOTE(review): the dyn_cast producing ShiftAmtNode (line 16562) and
// AndMaskNode (line 16566) are missing from this extract.
16563 if (!ShiftAmtNode)
16564 return SDValue();
16565 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
16567 if (!AndMaskNode)
16568 return SDValue();
16569 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
16570 // Don't transform uxtb/uxth.
16571 if (AndMask == 255 || AndMask == 65535)
16572 return SDValue();
// Rewrite (shl (and x, mask), amt) as shl + srl so Thumb1 can use two
// cheap shifts instead of a materialized mask constant.
16573 if (isMask_32(AndMask)) {
16575 if (MaskedBits > ShiftAmt) {
16576 SDLoc DL(N);
16577 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
16579 return DAG.getNode(
16580 ISD::SRL, DL, MVT::i32, SHL,
16581 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
16582 }
16583 }
16584 }
16585
16586 // Nothing to be done for scalar shifts.
16587 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16588 if (!VT.isVector() || !TLI.isTypeLegal(VT))
16589 return SDValue();
16590 if (ST->hasMVEIntegerOps() && VT == MVT::v2i64)
16591 return SDValue();
16592
16593 int64_t Cnt;
16594
16595 switch (N->getOpcode()) {
16596 default: llvm_unreachable("unexpected shift opcode");
16597
16598 case ISD::SHL:
16599 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
16600 SDLoc dl(N);
16601 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
16602 DAG.getConstant(Cnt, dl, MVT::i32));
16603 }
16604 break;
16605
16606 case ISD::SRA:
16607 case ISD::SRL:
16608 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
16609 unsigned VShiftOpc =
16610 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
16611 SDLoc dl(N);
16612 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
16613 DAG.getConstant(Cnt, dl, MVT::i32));
16614 }
16615 }
16616 return SDValue();
16617}
16618
16619// Look for a sign/zero/fpextend extend of a larger than legal load. This can be
16620// split into multiple extending loads, which are simpler to deal with than an
16621// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
16622// to convert the type to an f32.
// NOTE(review): the signature line (original 16623,
// PerformSplittingToWideningLoad) is missing from this extract; `N` is the
// extend node and `DAG` the SelectionDAG.
16624 SDValue N0 = N->getOperand(0);
16625 if (N0.getOpcode() != ISD::LOAD)
16626 return SDValue();
// NOTE(review): the cast producing `LD` (line 16627) is missing from this
// extract.
16628 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
16629 LD->getExtensionType() != ISD::NON_EXTLOAD)
16630 return SDValue();
16631 EVT FromVT = LD->getValueType(0);
16632 EVT ToVT = N->getValueType(0);
16633 if (!ToVT.isVector())
16634 return SDValue();
16635 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
16636 EVT ToEltVT = ToVT.getVectorElementType();
16637 EVT FromEltVT = FromVT.getVectorElementType();
16638
// Only the i8->i32 and f16->f32 widenings are handled; each piece is 4 lanes.
16639 unsigned NumElements = 0;
16640 if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
16641 NumElements = 4;
16642 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
16643 NumElements = 4;
16644 if (NumElements == 0 ||
16645 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
16646 FromVT.getVectorNumElements() % NumElements != 0 ||
16647 !isPowerOf2_32(NumElements))
16648 return SDValue();
16649
16650 LLVMContext &C = *DAG.getContext();
16651 SDLoc DL(LD);
16652 // Details about the old load
16653 SDValue Ch = LD->getChain();
16654 SDValue BasePtr = LD->getBasePtr();
16655 Align Alignment = LD->getOriginalAlign();
16656 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
16657 AAMDNodes AAInfo = LD->getAAInfo();
16658
// Choose sext vs zext loads based on the extend being combined; NewFromVT /
// NewToVT are the per-piece integer vector types.
16660 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
16661 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
16663 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
16665 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
16666
// Emit one extending load per NumElements-sized slice, at increasing offsets.
// NOTE(review): the Loads/Chains vector declarations (lines 16667-16668) and
// the ObjectPtrOffset / getExtLoad calls (16672, 16674-16675) are missing
// from this extract.
16669 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
16670 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
16671 SDValue NewPtr =
16673
16676 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
16677 Alignment, MMOFlags, AAInfo);
16678 Loads.push_back(NewLoad);
16679 Chains.push_back(SDValue(NewLoad.getNode(), 1));
16680 }
16681
16682 // Float truncs need to extended with VCVTB's into their floating point types.
16683 if (FromEltVT == MVT::f16) {
16685
16686 for (unsigned i = 0; i < Loads.size(); i++) {
// NOTE(review): the bitcast continuation and the VCVTL node creating FPExt
// (lines 16688-16689) are missing from this extract.
16687 SDValue LoadBC =
16690 DAG.getConstant(0, DL, MVT::i32));
16691 Extends.push_back(FPExt);
16692 }
16693
16694 Loads = Extends;
16695 }
16696
// Replace the original chain uses and concatenate the pieces back together.
// NOTE(review): the TokenFactor / ReplaceAllUsesOfValueWith lines
// (16697-16698) are missing from this extract.
16699 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
16700}
16701
16702/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
16703/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
// NOTE(review): the first signature line (original 16704) is missing from
// this extract.
16705 const ARMSubtarget *ST) {
16706 SDValue N0 = N->getOperand(0);
16707
16708 // Check for sign- and zero-extensions of vector extract operations of 8- and
16709 // 16-bit vector elements. NEON and MVE support these directly. They are
16710 // handled during DAG combining because type legalization will promote them
16711 // to 32-bit types and it is messy to recognize the operations after that.
16712 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
// NOTE(review): the EXTRACT_VECTOR_ELT opcode check (line 16713) is missing
// from this extract.
16714 SDValue Vec = N0.getOperand(0);
16715 SDValue Lane = N0.getOperand(1);
16716 EVT VT = N->getValueType(0);
16717 EVT EltVT = N0.getValueType();
16718 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16719
16720 if (VT == MVT::i32 &&
16721 (EltVT == MVT::i8 || EltVT == MVT::i16) &&
16722 TLI.isTypeLegal(Vec.getValueType()) &&
16723 isa<ConstantSDNode>(Lane)) {
16724
// Signed extracts use VGETLANEs; zero/any extends both map to VGETLANEu.
16725 unsigned Opc = 0;
16726 switch (N->getOpcode()) {
16727 default: llvm_unreachable("unexpected opcode");
16728 case ISD::SIGN_EXTEND:
16729 Opc = ARMISD::VGETLANEs;
16730 break;
16731 case ISD::ZERO_EXTEND:
16732 case ISD::ANY_EXTEND:
16733 Opc = ARMISD::VGETLANEu;
16734 break;
16735 }
16736 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
16737 }
16738 }
16739
// MVE: try splitting a larger-than-legal extending load (the call producing
// NewLoad on line 16741 is missing from this extract; presumably
// PerformSplittingToWideningLoad).
16740 if (ST->hasMVEIntegerOps())
16742 return NewLoad;
16743
16744 return SDValue();
16745}
16746
// NOTE(review): signature (line 16747) and the guarded call (line 16750) are
// missing from this listing. Visible logic: under MVE float ops, attempt a
// load-splitting combine and return its result, otherwise no change.
16748 const ARMSubtarget *ST) {
16749 if (ST->hasMVEFloatOps())
16751 return NewLoad;
16752
16753 return SDValue();
16754}
16755
// NOTE(review): extracted listing with gaps (16758, 16778, 16785, 16798,
// 16801, 16809, 16821, 16828, 16839, 16842, 16850 absent), hiding the
// signature, the SaturateC declarations, the HalfVT assignments, and the
// VQMOVN node-creation lines. Code left byte-identical; comments only added.
16756/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
16757/// saturates.
16759 const ARMSubtarget *ST) {
16760 EVT VT = N->getValueType(0);
16761 SDValue N0 = N->getOperand(0);
16762 if (!ST->hasMVEIntegerOps())
16763 return SDValue();
16764
16765 if (SDValue V = PerformVQDMULHCombine(N, DAG))
16766 return V;
16767
// Only v4i32 and v8i16 have a narrower MVE type to saturate into.
16768 if (VT != MVT::v4i32 && VT != MVT::v8i16)
16769 return SDValue();
16770
// Matches smin(smax(x, -2^(k-1)), 2^(k-1)-1) in either nesting order,
// i.e. a signed saturate to half the element width.
16771 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
16772 // Check one is a smin and the other is a smax
16773 if (Min->getOpcode() != ISD::SMIN)
16774 std::swap(Min, Max)
16775 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
16776 return false;
16777
16779 if (VT == MVT::v4i32)
16780 SaturateC = APInt(32, (1 << 15) - 1, true);
16781 else //if (VT == MVT::v8i16)
16782 SaturateC = APInt(16, (1 << 7) - 1, true);
16783
16784 APInt MinC, MaxC;
16786 MinC != SaturateC)
16787 return false;
// Max bound must be the bitwise complement of the min bound, i.e. -2^(k-1).
16788 if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
16789 MaxC != ~SaturateC)
16790 return false;
16791 return true;
16792 };
16793
16794 if (IsSignedSaturate(N, N0.getNode())) {
16795 SDLoc DL(N);
16796 MVT ExtVT, HalfVT;
16797 if (VT == MVT::v4i32) {
16799 ExtVT = MVT::v4i16;
16800 } else { // if (VT == MVT::v8i16)
16802 ExtVT = MVT::v8i8;
16803 }
16804
16805 // Create a VQMOVNB with undef top lanes, then signed extended into the top
16806 // half. That extend will hopefully be removed if only the bottom bits are
16807 // demanded (though a truncating store, for example).
16808 SDValue VQMOVN =
16810 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
16811 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
16812 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
16813 DAG.getValueType(ExtVT));
16814 }
16815
16816 auto IsUnsignedSaturate = [&](SDNode *Min) {
16817 // For unsigned, we just need to check for <= 0xffff
16818 if (Min->getOpcode() != ISD::UMIN)
16819 return false;
16820
16822 if (VT == MVT::v4i32)
16823 SaturateC = APInt(32, (1 << 16) - 1, true);
16824 else //if (VT == MVT::v8i16)
16825 SaturateC = APInt(16, (1 << 8) - 1, true);
16826
16827 APInt MinC;
16829 MinC != SaturateC)
16830 return false;
16831 return true;
16832 };
16833
16834 if (IsUnsignedSaturate(N)) {
16835 SDLoc DL(N);
16836 MVT HalfVT;
16837 unsigned ExtConst;
16838 if (VT == MVT::v4i32) {
16840 ExtConst = 0x0000FFFF;
16841 } else { //if (VT == MVT::v8i16)
16843 ExtConst = 0x00FF;
16844 }
16845
16846 // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
16847 // an AND. That extend will hopefully be removed if only the bottom bits are
16848 // demanded (though a truncating store, for example).
16849 SDValue VQMOVN =
16851 DAG.getConstant(0, DL, MVT::i32));
16852 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
16853 return DAG.getNode(ISD::AND, DL, VT, Bitcast,
16854 DAG.getConstant(ExtConst, DL, VT));
16855 }
16856
16857 return SDValue();
16858}
16859
// NOTE(review): fragment — the signature and the dyn_cast producing `C`
// (lines 16860–16861) are missing. Visible contract: returns a pointer to the
// node's APInt if it is a power-of-two constant, nullptr otherwise.
16862 if (!C)
16863 return nullptr;
16864 const APInt *CV = &C->getAPIntValue();
16865 return CV->isPowerOf2() ? CV : nullptr;
16866}
16867
// NOTE(review): the function signature (line 16868) and the OrC definition
// (line 16912) are missing from this listing; presumably this is
// PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) — TODO confirm.
// Code left byte-identical; comments only added.
16869 // If we have a CMOV, OR and AND combination such as:
16870 // if (x & CN)
16871 // y |= CM;
16872 //
16873 // And:
16874 // * CN is a single bit;
16875 // * All bits covered by CM are known zero in y
16876 //
16877 // Then we can convert this into a sequence of BFI instructions. This will
16878 // always be a win if CM is a single bit, will always be no worse than the
16879 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
16880 // three bits (due to the extra IT instruction).
16881
16882 SDValue Op0 = CMOV->getOperand(0);
16883 SDValue Op1 = CMOV->getOperand(1);
16884 auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2));
16885 auto CC = CCNode->getAPIntValue().getLimitedValue();
16886 SDValue CmpZ = CMOV->getOperand(4);
16887
16888 // The compare must be against zero.
16889 if (!isNullConstant(CmpZ->getOperand(1)))
16890 return SDValue();
16891
16892 assert(CmpZ->getOpcode() == ARMISD::CMPZ);
16893 SDValue And = CmpZ->getOperand(0);
16894 if (And->getOpcode() != ISD::AND)
16895 return SDValue();
// CN (the tested bit) must be a single bit for the BFI transform to apply.
16896 const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
16897 if (!AndC)
16898 return SDValue();
16899 SDValue X = And->getOperand(0);
16900
16901 if (CC == ARMCC::EQ) {
16902 // We're performing an "equal to zero" compare. Swap the operands so we
16903 // canonicalize on a "not equal to zero" compare.
16904 std::swap(Op0, Op1);
16905 } else {
16906 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
16907 }
16908
16909 if (Op1->getOpcode() != ISD::OR)
16910 return SDValue();
16911
// OrC comes from the missing line 16912 — presumably
// dyn_cast<ConstantSDNode>(Op1->getOperand(1)); confirm upstream.
16913 if (!OrC)
16914 return SDValue();
16915 SDValue Y = Op1->getOperand(0);
16916
16917 if (Op0 != Y)
16918 return SDValue();
16919
16920 // Now, is it profitable to continue?
16921 APInt OrCI = OrC->getAPIntValue();
// One BFI per set bit in CM; Thumb tolerates one more due to IT overhead of
// the alternative sequence (see profitability comment above).
16922 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
16923 if (OrCI.countPopulation() > Heuristic)
16924 return SDValue();
16925
16926 // Lastly, can we determine that the bits defined by OrCI
16927 // are zero in Y?
16928 KnownBits Known = DAG.computeKnownBits(Y);
16929 if ((OrCI & Known.Zero) != OrCI)
16930 return SDValue();
16931
16932 // OK, we can do the combine.
16933 SDValue V = Y;
16934 SDLoc dl(X);
16935 EVT VT = X.getValueType();
16936 unsigned BitInX = AndC->logBase2();
16937
16938 if (BitInX != 0) {
16939 // We must shift X first.
16940 X = DAG.getNode(ISD::SRL, dl, VT, X,
16941 DAG.getConstant(BitInX, dl, VT));
16942 }
16943
// Emit one BFI per set bit of CM, each inserting the tested bit of X into Y.
16944 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
16946 if (OrCI[BitInY] == 0)
16947 continue;
16948 APInt Mask(VT.getSizeInBits(), 0);
16949 Mask.setBit(BitInY);
16950 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
16951 // Confusingly, the operand is an *inverted* mask.
16952 DAG.getConstant(~Mask, dl, VT));
16953 }
16954
16955 return V;
16956}
16957
// NOTE(review): the signature line 16964 is missing; from the recursive calls
// it is SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
// bool &Negate) — TODO confirm. The ISD::INTRINSIC_W_CHAIN case label (line
// 16990) is also missing. Code left byte-identical; comments only added.
16958// Given N, the value controlling the conditional branch, search for the loop
16959// intrinsic, returning it, along with how the value is used. We need to handle
16960// patterns such as the following:
16961// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
16962// (brcond (setcc (loop.decrement), 0, eq), exit)
16963// (brcond (setcc (loop.decrement), 0, ne), header)
16965 bool &Negate) {
16966 switch (N->getOpcode()) {
16967 default:
16968 break;
16969 case ISD::XOR: {
// Only (xor x, 1) flips the condition; anything else is not recognized.
16970 if (!isa<ConstantSDNode>(N.getOperand(1)))
16971 return SDValue();
16972 if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
16973 return SDValue();
16974 Negate = !Negate;
16975 return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
16976 }
16977 case ISD::SETCC: {
// Record the compared-against constant (must be 0 or 1) and the condition,
// then keep searching through the setcc's operand.
16978 auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
16979 if (!Const)
16980 return SDValue();
16981 if (Const->isNullValue())
16982 Imm = 0;
16983 else if (Const->isOne())
16984 Imm = 1;
16985 else
16986 return SDValue();
16987 CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
16988 return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
16989 }
// Missing case label (line 16990) — presumably ISD::INTRINSIC_W_CHAIN.
16991 unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue();
16992 if (IntOp != Intrinsic::test_start_loop_iterations &&
16993 IntOp != Intrinsic::loop_decrement_reg)
16994 return SDValue();
16995 return N;
16996 }
16997 }
16998 return SDValue();
16999}
17000
// NOTE(review): extracted listing — the signature (lines 17001-17002,
// presumably PerformHWLoopCombine(SDNode *N, TargetLowering::DAGCombinerInfo
// &DCI, const ARMSubtarget *ST)), the OtherTarget definition (line 17073),
// and several node-creation lines (17078-17079, 17103, 17106) are missing.
// Code left byte-identical; comments only added.
17003 const ARMSubtarget *ST) {
17004
17005 // The hwloop intrinsics that we're interested are used for control-flow,
17006 // either for entering or exiting the loop:
17007 // - test.start.loop.iterations will test whether its operand is zero. If it
17008 // is zero, the proceeding branch should not enter the loop.
17009 // - loop.decrement.reg also tests whether its operand is zero. If it is
17010 // zero, the proceeding branch should not branch back to the beginning of
17011 // the loop.
17012 // So here, we need to check that how the brcond is using the result of each
17013 // of the intrinsics to ensure that we're branching to the right place at the
17014 // right time.
17015
17016 ISD::CondCode CC;
17017 SDValue Cond;
17018 int Imm = 1;
17019 bool Negate = false;
17020 SDValue Chain = N->getOperand(0);
17021 SDValue Dest;
17022
// Normalize BRCOND and BR_CC to (condition value, destination, Imm) form.
17023 if (N->getOpcode() == ISD::BRCOND) {
17024 CC = ISD::SETEQ;
17025 Cond = N->getOperand(1);
17026 Dest = N->getOperand(2);
17027 } else {
17028 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
17029 CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
17030 Cond = N->getOperand(2);
17031 Dest = N->getOperand(4);
17032 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
17033 if (!Const->isOne() && !Const->isNullValue())
17034 return SDValue();
17035 Imm = Const->getZExtValue();
17036 } else
17037 return SDValue();
17038 }
17039
17040 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
17041 if (!Int)
17042 return SDValue();
17043
// SearchLoopIntrinsic set Negate when it walked through an (xor x, 1).
17044 if (Negate)
17045 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
17046
17047 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
17048 return (CC == ISD::SETEQ && Imm == 0) ||
17049 (CC == ISD::SETNE && Imm == 1) ||
17050 (CC == ISD::SETLT && Imm == 1) ||
17051 (CC == ISD::SETULT && Imm == 1);
17052 };
17053
17054 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
17055 return (CC == ISD::SETEQ && Imm == 1) ||
17056 (CC == ISD::SETNE && Imm == 0) ||
17057 (CC == ISD::SETGT && Imm == 0) ||
17058 (CC == ISD::SETUGT && Imm == 0) ||
17059 (CC == ISD::SETGE && Imm == 1) ||
17060 (CC == ISD::SETUGE && Imm == 1);
17061 };
17062
17063 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
17064 "unsupported condition");
17065
17066 SDLoc dl(Int);
17067 SelectionDAG &DAG = DCI.DAG;
17068 SDValue Elements = Int.getOperand(2);
17069 unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue();
17070 assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR)
17071 && "expected single br user");
17072 SDNode *Br = *N->use_begin();
// OtherTarget defined on the missing line 17073 — presumably the fall-through
// br's destination, Br->getOperand(1); confirm upstream.
17074
17075 // Update the unconditional branch to branch to the given Dest.
17076 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
17077 SDValue NewBrOps[] = { Br->getOperand(0), Dest };
17080 };
17081
17082 if (IntOp == Intrinsic::test_start_loop_iterations) {
17083 SDValue Res;
17084 SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
17085 // We expect this 'instruction' to branch when the counter is zero.
17086 if (IsTrueIfZero(CC, Imm)) {
17087 SDValue Ops[] = {Chain, Setup, Dest};
17088 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
17089 } else {
17090 // The logic is the reverse of what we need for WLS, so find the other
17091 // basic block target: the target of the proceeding br.
17092 UpdateUncondBr(Br, Dest, DAG);
17093
17094 SDValue Ops[] = {Chain, Setup, OtherTarget};
17095 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
17096 }
17097 // Update LR count to the new value
17098 DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);
17099 // Update chain
17100 DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
17101 return Res;
17102 } else {
// loop.decrement.reg path: build the decrement (node creation lines 17103 and
// 17106 are missing — presumably a constant Size and an ARMISD::LOOP_DEC).
17104 cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32);
17105 SDValue Args[] = { Int.getOperand(0), Elements, Size, };
17107 DAG.getVTList(MVT::i32, MVT::Other), Args);
17108 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
17109
17110 // We expect this instruction to branch when the count is not zero.
17111 SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
17112
17113 // Update the unconditional branch to target the loop preheader if we've
17114 // found the condition has been reversed.
17115 if (Target == OtherTarget)
17116 UpdateUncondBr(Br, Dest, DAG);
17117
17118 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
17119 SDValue(LoopDec.getNode(), 1), Chain);
17120
17121 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
17122 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
17123 }
17124 return SDValue();
17125}
17126
// NOTE(review): the declarator line 17129 (presumably
// ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG)
// const) and the CC initializer line 17143 are missing from this listing.
// Code left byte-identical; comments only added.
17127/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
17128SDValue
17130 SDValue Cmp = N->getOperand(4);
17131 if (Cmp.getOpcode() != ARMISD::CMPZ)
17132 // Only looking at NE cases.
17133 return SDValue();
17134
17135 EVT VT = N->getValueType(0);
17136 SDLoc dl(N);
17137 SDValue LHS = Cmp.getOperand(0);
17138 SDValue RHS = Cmp.getOperand(1);
17139 SDValue Chain = N->getOperand(0);
17140 SDValue BB = N->getOperand(1);
17141 SDValue ARMcc = N->getOperand(2);
17142 ARMCC::CondCodes CC =
17144
17145 // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
17146 // -> (brcond Chain BB CC CPSR Cmp)
17147 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
17148 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
17149 LHS->getOperand(0)->hasOneUse()) {
// Pattern-match the exact constants: cmov(0, 1) & 1, compared against 0 —
// the branch then reduces to the cmov's own condition and compare.
17150 auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0));
17151 auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1));
17152 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
17153 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
17154 if ((LHS00C && LHS00C->getZExtValue() == 0) &&
17155 (LHS01C && LHS01C->getZExtValue() == 1) &&
17156 (LHS1C && LHS1C->getZExtValue() == 1) &&
17157 (RHSC && RHSC->getZExtValue() == 0)) {
17158 return DAG.getNode(
17159 ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
17160 LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
17161 }
17162 }
17163
17164 return SDValue();
17165}
17166
// NOTE(review): the declarator line 17169, the CC initializer line 17183, and
// the PerformCMOVToBFICombine call line 17187 are missing from this listing.
// Code left byte-identical; comments only added.
17167/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
17168SDValue
17170 SDValue Cmp = N->getOperand(4);
17171 if (Cmp.getOpcode() != ARMISD::CMPZ)
17172 // Only looking at EQ and NE cases.
17173 return SDValue();
17174
17175 EVT VT = N->getValueType(0);
17176 SDLoc dl(N);
17177 SDValue LHS = Cmp.getOperand(0);
17178 SDValue RHS = Cmp.getOperand(1);
17179 SDValue FalseVal = N->getOperand(0);
17180 SDValue TrueVal = N->getOperand(1);
17181 SDValue ARMcc = N->getOperand(2);
17182 ARMCC::CondCodes CC =
17184
17185 // BFI is only available on V6T2+.
17186 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
// R produced by the missing line 17187 — presumably PerformCMOVToBFICombine.
17188 if (R)
17189 return R;
17190 }
17191
17192 // Simplify
17193 // mov r1, r0
17194 // cmp r1, x
17195 // mov r0, y
17196 // moveq r0, x
17197 // to
17198 // cmp r0, x
17199 // movne r0, y
17200 //
17201 // mov r1, r0
17202 // cmp r1, x
17203 // mov r0, x
17204 // movne r0, y
17205 // to
17206 // cmp r0, x
17207 // movne r0, y
17208 /// FIXME: Turn this into a target neutral optimization?
17209 SDValue Res;
17210 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
17211 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
17212 N->getOperand(3), Cmp);
17213 } else if (CC == ARMCC::EQ && TrueVal == RHS) {
// Inner ARMcc deliberately shadows the outer one: getARMCmp fills in the
// condition code for the new SETNE compare.
17214 SDValue ARMcc;
17215 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
17216 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
17217 N->getOperand(3), NewCmp);
17218 }
17219
17220 // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
17221 // -> (cmov F T CC CPSR Cmp)
17222 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
17223 auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
17224 auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
17225 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
17226 if ((LHS0C && LHS0C->getZExtValue() == 0) &&
17227 (LHS1C && LHS1C->getZExtValue() == 1) &&
17228 (RHSC && RHSC->getZExtValue() == 0)) {
17229 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
17230 LHS->getOperand(2), LHS->getOperand(3),
17231 LHS->getOperand(4));
17232 }
17233 }
17234
17235 if (!VT.isInteger())
17236 return SDValue();
17237
17238 // Materialize a boolean comparison for integers so we can avoid branching.
17239 if (isNullConstant(FalseVal)) {
17240 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
17241 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
17242 // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
17243 // right 5 bits will make that 32 be 1, otherwise it will be 0.
17244 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
17245 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
17246 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
17247 DAG.getConstant(5, dl, MVT::i32));
17248 } else {
17249 // CMOV 0, 1, ==, (CMPZ x, y) ->
17250 // (ADDCARRY (SUB x, y), t:0, t:1)
17251 // where t = (SUBCARRY 0, (SUB x, y), 0)
17252 //
17253 // The SUBCARRY computes 0 - (x - y) and this will give a borrow when
17254 // x != y. In other words, a carry C == 1 when x == y, C == 0
17255 // otherwise.
17256 // The final ADDCARRY computes
17257 // x - y + (0 - (x - y)) + C == C
17258 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
17259 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
17260 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
17261 // ISD::SUBCARRY returns a borrow but we want the carry here
17262 // actually.
17263 SDValue Carry =
17264 DAG.getNode(ISD::SUB, dl, MVT::i32,
17265 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
17266 Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
17267 }
17268 } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
17269 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
17270 // This seems pointless but will allow us to combine it further below.
17271 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
17272 SDValue Sub =
17273 DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
17274 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
17275 Sub.getValue(1), SDValue());
17276 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
17277 N->getOperand(3), CPSRGlue.getValue(1));
17278 FalseVal = Sub;
17279 }
17280 } else if (isNullConstant(TrueVal)) {
17281 if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
17282 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
17283 // This seems pointless but will allow us to combine it further below
17284 // Note that we change == for != as this is the dual for the case above.
17285 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
17286 SDValue Sub =
17287 DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
17288 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
17289 Sub.getValue(1), SDValue());
17290 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
17291 DAG.getConstant(ARMCC::NE, dl, MVT::i32),
17292 N->getOperand(3), CPSRGlue.getValue(1));
17293 FalseVal = Sub;
17294 }
17295 }
17296
17297 // On Thumb1, the DAG above may be further combined if z is a power of 2
17298 // (z == 2 ^ K).
17299 // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 ->
17300 // t1 = (USUBO (SUB x, y), 1)
17301 // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
17302 // Result = if K != 0 then (SHL t2:0, K) else t2:0
17303 //
17304 // This also handles the special case of comparing against zero; it's
17305 // essentially, the same pattern, except there's no SUBS:
17306 // CMOV x, z, !=, (CMPZ x, 0) ->
17307 // t1 = (USUBO x, 1)
17308 // t2 = (SUBCARRY x, t1:0, t1:1)
17309 // Result = if K != 0 then (SHL t2:0, K) else t2:0
17310 const APInt *TrueConst;
17311 if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
17312 ((FalseVal.getOpcode() == ARMISD::SUBS &&
17313 FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) ||
17314 (FalseVal == LHS && isNullConstant(RHS))) &&
17315 (TrueConst = isPowerOf2Constant(TrueVal))) {
17316 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
17317 unsigned ShiftAmount = TrueConst->logBase2();
// When z == 2^K with K != 0, compute the boolean against 1 and shift by K.
17318 if (ShiftAmount)
17319 TrueVal = DAG.getConstant(1, dl, VT);
17320 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
17321 Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1));
17322
17323 if (ShiftAmount)
17324 Res = DAG.getNode(ISD::SHL, dl, VT, Res,
17325 DAG.getConstant(ShiftAmount, dl, MVT::i32));
17326 }
17327
17328 if (Res.getNode()) {
17329 KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
17330 // Capture demanded bits information that would be otherwise lost.
17331 if (Known.Zero == 0xfffffffe)
17332 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
17333 DAG.getValueType(MVT::i1));
17334 else if (Known.Zero == 0xffffff00)
17335 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
17336 DAG.getValueType(MVT::i8));
17337 else if (Known.Zero == 0xffff0000)
17338 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
17339 DAG.getValueType(MVT::i16));
17340 }
17341
17342 return Res;
17343}
17344
// NOTE(review): the signature (lines 17345-17346, presumably
// PerformBITCASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
// const ARMSubtarget *ST)) and the guarded call on line 17375 are missing
// from this listing. Code left byte-identical; comments only added.
17347 const ARMSubtarget *ST) {
17348 SelectionDAG &DAG = DCI.DAG;
17349 SDValue Src = N->getOperand(0);
17350 EVT DstVT = N->getValueType(0);
17351
17352 // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
17353 if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
17354 EVT SrcVT = Src.getValueType();
17355 if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
17356 return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
17357 }
17358
17359 // We may have a bitcast of something that has already had this bitcast
17360 // combine performed on it, so skip past any VECTOR_REG_CASTs.
17361 while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST)
17362 Src = Src.getOperand(0);
17363
17364 // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
17365 // would be generated is at least the width of the element type.
17366 EVT SrcVT = Src.getValueType();
17367 if ((Src.getOpcode() == ARMISD::VMOVIMM ||
17368 Src.getOpcode() == ARMISD::VMVNIMM ||
17369 Src.getOpcode() == ARMISD::VMOVFPIMM) &&
17370 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
17371 DAG.getDataLayout().isBigEndian())
17372 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
17373
17374 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
// R produced by the missing line 17375 — presumably a PerformExtractEltToVMOVRRD
// style helper; confirm upstream.
17376 return R;
17377
17378 return SDValue();
17379}
17380
// NOTE(review): extracted listing — the signature (lines 17383-17384,
// presumably PerformMVETruncCombine(SDNode *N, TargetLowering::DAGCombinerInfo
// &DCI)), the Extracts declaration (17438), the extract_vector_elt creation
// (17442), the StoreVT declaration (17462), and the MachinePointerInfo lines
// (17471, 17480) are missing. Code left byte-identical; comments only added.
17381// Some combines for the MVETrunc truncations legalizer helper. Also lowers the
17382// node into stack operations after legalizeOps.
17385 SelectionDAG &DAG = DCI.DAG;
17386 EVT VT = N->getValueType(0);
17387 SDLoc DL(N);
17388
17389 // MVETrunc(Undef, Undef) -> Undef
17390 if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); }))
17391 return DAG.getUNDEF(VT);
17392
17393 // MVETrunc(MVETrunc a b, MVETrunc c, d) -> MVETrunc
17394 if (N->getNumOperands() == 2 &&
17395 N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&
17396 N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)
17397 return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),
17398 N->getOperand(0).getOperand(1),
17399 N->getOperand(1).getOperand(0),
17400 N->getOperand(1).getOperand(1));
17401
17402 // MVETrunc(shuffle, shuffle) -> VMOVN
17403 if (N->getNumOperands() == 2 &&
17404 N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
17405 N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {
17406 auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());
17407 auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());
17408
// Both shuffles must read the same two sources for the masks to be fusable.
17409 if (S0->getOperand(0) == S1->getOperand(0) &&
17410 S0->getOperand(1) == S1->getOperand(1)) {
17411 // Construct complete shuffle mask
17412 SmallVector<int, 8> Mask(S0->getMask().begin(), S0->getMask().end());
17413 Mask.append(S1->getMask().begin(), S1->getMask().end());
17414
17415 if (isVMOVNTruncMask(Mask, VT, 0))
17416 return DAG.getNode(
17417 ARMISD::VMOVN, DL, VT,
17418 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
17419 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
17420 DAG.getConstant(1, DL, MVT::i32));
17421 if (isVMOVNTruncMask(Mask, VT, 1))
17422 return DAG.getNode(
17423 ARMISD::VMOVN, DL, VT,
17424 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
17425 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
17426 DAG.getConstant(1, DL, MVT::i32));
17427 }
17428 }
17429
17430 // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
17431 // truncate to a buildvector to allow the generic optimisations to kick in.
17432 if (all_of(N->ops(), [](SDValue Op) {
17433 return Op.getOpcode() == ISD::BUILD_VECTOR ||
17434 Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
17435 (Op.getOpcode() == ISD::BITCAST &&
17436 Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
17437 })) {
17439 for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
17440 SDValue O = N->getOperand(Op);
17441 for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
17443 DAG.getConstant(i, DL, MVT::i32));
17444 Extracts.push_back(Ext);
17445 }
17446 }
17447 return DAG.getBuildVector(VT, DL, Extracts);
17448 }
17449
17450 // If we are late in the legalization process and nothing has optimised
17451 // the trunc to anything better, lower it to a stack store and reload,
17452 // performing the truncation whilst keeping the lanes in the correct order:
17453 // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
17454 if (!DCI.isAfterLegalizeDAG())
17455 return SDValue();
17456
17457 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4));
17458 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
17459 int NumIns = N->getNumOperands();
17460 assert((NumIns == 2 || NumIns == 4) &&
17461 "Expected 2 or 4 inputs to an MVETrunc");
17463 if (N->getNumOperands() == 4)
17464 StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());
17465
// Each input is truncating-stored at its slice of the 16-byte slot, then the
// whole slot is reloaded as one vector of type VT.
17466 SmallVector<SDValue> Chains;
17467 for (int I = 0; I < NumIns; I++) {
17468 SDValue Ptr = DAG.getNode(
17469 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
17470 DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
17472 DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
17473 SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
17474 Ptr, MPI, StoreVT, Align(4));
17475 Chains.push_back(Ch);
17476 }
17477
17478 SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
17479 MachinePointerInfo MPI =
17481 return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
17482}
17483
// NOTE(review): extracted listing — the signature line 17485, the LD dyn_cast
// (17488), the NewExtType declaration (17507), the NewFromVT/NewToVT
// declarations (17524, 17526), the Loads/Chains declarations (17529-17530),
// the pointer-arithmetic and getExtLoad lines (17534, 17536-17537), and the
// chain TokenFactor update (17544-17545) are missing. Code left
// byte-identical; comments only added.
17484// Take a MVEEXT(load x) and split that into (extload x, extload x+8)
17486 SelectionDAG &DAG) {
17487 SDValue N0 = N->getOperand(0);
17489 if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
17490 return SDValue();
17491
17492 EVT FromVT = LD->getMemoryVT();
17493 EVT ToVT = N->getValueType(0);
17494 if (!ToVT.isVector())
17495 return SDValue();
17496 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
17497 EVT ToEltVT = ToVT.getVectorElementType();
17498 EVT FromEltVT = FromVT.getVectorElementType();
17499
// NumElements is the per-piece element count of each split extload.
17500 unsigned NumElements = 0;
17501 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
17502 NumElements = 4;
17503 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
17504 NumElements = 8;
17505 assert(NumElements != 0);
17506
17508 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
// The original load's extension must be compatible with the one we produce.
17509 if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
17510 LD->getExtensionType() != ISD::EXTLOAD &&
17511 LD->getExtensionType() != NewExtType)
17512 return SDValue();
17513
17514 LLVMContext &C = *DAG.getContext();
17515 SDLoc DL(LD);
17516 // Details about the old load
17517 SDValue Ch = LD->getChain();
17518 SDValue BasePtr = LD->getBasePtr();
17519 Align Alignment = LD->getOriginalAlign();
17520 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
17521 AAMDNodes AAInfo = LD->getAAInfo();
17522
17523 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
17525 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
17527 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
17528
// Emit one extending load per NewFromVT-sized slice of the original memory.
17531 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
17532 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
17533 SDValue NewPtr =
17535
17538 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
17539 Alignment, MMOFlags, AAInfo);
17540 Loads.push_back(NewLoad);
17541 Chains.push_back(SDValue(NewLoad.getNode(), 1));
17542 }
17543
17546 return DAG.getMergeValues(Loads, DL);
17547}
17548
// NOTE(review): extracted listing — the signature (lines 17552-17553,
// presumably PerformMVEExtCombine(SDNode *N, TargetLowering::DAGCombinerInfo
// &DCI)), the VVT declaration (17563), the sign-extend-in-reg call (17565),
// the load-split call (17617), the MachinePointerInfo lines (17636, 17645),
// and the Loads declaration (17640) are missing. Code left byte-identical;
// comments only added.
17549// Perform combines for MVEEXT. If it has not be optimized to anything better
17550// before lowering, it gets converted to stack store and extloads performing the
17551// extend whilst still keeping the same lane ordering.
17554 SelectionDAG &DAG = DCI.DAG;
17555 EVT VT = N->getValueType(0);
17556 SDLoc DL(N);
17557 assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
17558 assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");
17559
17560 EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
17561 *DAG.getContext());
// Extend-in-register helper: signed for MVESEXT, zero-extend otherwise.
17562 auto Extend = [&](SDValue V) {
17564 return N->getOpcode() == ARMISD::MVESEXT
17566 DAG.getValueType(ExtVT))
17567 : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
17568 };
17569
17570 // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
17571 if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
17572 SDValue Ext = Extend(N->getOperand(0));
17573 return DAG.getMergeValues({Ext, Ext}, DL);
17574 }
17575
17576 // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
17577 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
17578 ArrayRef<int> Mask = SVN->getMask();
17579 assert(Mask.size() == 2 * VT.getVectorNumElements());
17580 assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
17581 unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
17582 SDValue Op0 = SVN->getOperand(0);
17583 SDValue Op1 = SVN->getOperand(1);
17584
// True if the mask half starting at Start picks every-other lane (stride 2)
// starting at Offset; undef (-1) lanes are accepted.
17585 auto CheckInregMask = [&](int Start, int Offset) {
17586 for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
17587 if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
17588 return false;
17589 return true;
17590 };
17591 SDValue V0 = SDValue(N, 0);
17592 SDValue V1 = SDValue(N, 1);
17593 if (CheckInregMask(0, 0))
17594 V0 = Extend(Op0);
17595 else if (CheckInregMask(0, 1))
17596 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
17597 else if (CheckInregMask(0, Mask.size()))
17598 V0 = Extend(Op1);
17599 else if (CheckInregMask(0, Mask.size() + 1))
17600 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
17601
17602 if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
17603 V1 = Extend(Op1);
17604 else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
17605 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
17606 else if (CheckInregMask(VT.getVectorNumElements(), 0))
17607 V1 = Extend(Op0);
17608 else if (CheckInregMask(VT.getVectorNumElements(), 1))
17609 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
17610
// Only rewrite if at least one of the two results was actually matched.
17611 if (V0.getNode() != N || V1.getNode() != N)
17612 return DAG.getMergeValues({V0, V1}, DL);
17613 }
17614
17615 // MVEEXT(load) -> extload, extload
17616 if (N->getOperand(0)->getOpcode() == ISD::LOAD)
17618 return L;
17619
17620 if (!DCI.isAfterLegalizeDAG())
17621 return SDValue();
17622
17623 // Lower to a stack store and reload:
17624 // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
17625 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::Fixed(16), Align(4));
17626 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
17627 int NumOuts = N->getNumValues();
17628 assert((NumOuts == 2 || NumOuts == 4) &&
17629 "Expected 2 or 4 outputs to an MVEEXT");
17630 EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
17631 *DAG.getContext());
17632 if (N->getNumOperands() == 4)
17633 LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());
17634
17635 MachinePointerInfo MPI =
17637 SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
17638 StackPtr, MPI, Align(4));
17639
// Reload each slice of the stored vector as an extending load of LoadVT.
17641 for (int I = 0; I < NumOuts; I++) {
17642 SDValue Ptr = DAG.getNode(
17643 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
17644 DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
17646 DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
17647 SDValue Load = DAG.getExtLoad(
17648 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,
17649 VT, Chain, Ptr, MPI, LoadVT, Align(4));
17650 Loads.push_back(Load);
17651 }
17652
17653 return DAG.getMergeValues(Loads, DL);
17654}
17655
// NOTE(review): the first line of this definition (return type, class and
// function name) was lost in extraction; from the body it is presumably
// SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, ...) — the
// target's main DAG-combine dispatcher, routing each opcode to its
// dedicated Perform*Combine helper. Several interior lines (the APInt
// DemandedMask constructions and a few case labels) are also missing from
// this excerpt — TODO confirm against the full file before editing.
17657                                              DAGCombinerInfo &DCI) const {
17658   switch (N->getOpcode()) {
17659   default: break;
17660   case ISD::SELECT_CC:
17661   case ISD::SELECT:     return PerformSELECTCombine(N, DCI, Subtarget);
17662   case ISD::VSELECT:    return PerformVSELECTCombine(N, DCI, Subtarget);
17663   case ISD::ABS:        return PerformABSCombine(N, DCI, Subtarget);
17664   case ARMISD::ADDE:    return PerformADDECombine(N, DCI, Subtarget);
17665   case ARMISD::UMLAL:   return PerformUMLALCombine(N, DCI.DAG, Subtarget);
17666   case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
17667   case ISD::SUB:        return PerformSUBCombine(N, DCI, Subtarget);
17668   case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
17669   case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
17670   case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
17671   case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
17672   case ISD::BRCOND:
17673   case ISD::BR_CC:      return PerformHWLoopCombine(N, DCI, Subtarget);
17674   case ARMISD::ADDC:
17675   case ARMISD::SUBC:    return PerformAddcSubcCombine(N, DCI, Subtarget);
17676   case ARMISD::SUBE:    return PerformAddeSubeCombine(N, DCI, Subtarget);
17677   case ARMISD::BFI:     return PerformBFICombine(N, DCI.DAG);
17678   case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
17679   case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
// NOTE(review): a case label (line 17680) is missing here in this excerpt.
17681   case ARMISD::VMOVrh:  return PerformVMOVrhCombine(N, DCI.DAG);
17682   case ISD::STORE:      return PerformSTORECombine(N, DCI, Subtarget);
17683   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
// NOTE(review): the case labels preceding this return (lines 17684-17685,
// presumably ISD::INSERT_VECTOR_ELT / EXTRACT_VECTOR_ELT) are missing here.
17686     return PerformExtractEltCombine(N, DCI, Subtarget);
17689   case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
17690   case ARMISD::VDUP:    return PerformVDUPCombine(N, DCI.DAG, Subtarget);
17691   case ISD::FP_TO_SINT:
17692   case ISD::FP_TO_UINT:
17693     return PerformVCVTCombine(N, DCI.DAG, Subtarget);
17694   case ISD::FDIV:
17695     return PerformVDIVCombine(N, DCI.DAG, Subtarget);
// NOTE(review): case label (line 17696, presumably ISD::INTRINSIC_WO_CHAIN)
// is missing from this excerpt.
17697     return PerformIntrinsicCombine(N, DCI);
17698   case ISD::SHL:
17699   case ISD::SRA:
17700   case ISD::SRL:
17701     return PerformShiftCombine(N, DCI, Subtarget);
17702   case ISD::SIGN_EXTEND:
17703   case ISD::ZERO_EXTEND:
17704   case ISD::ANY_EXTEND:
17705     return PerformExtendCombine(N, DCI.DAG, Subtarget);
17706   case ISD::FP_EXTEND:
17707     return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
17708   case ISD::SMIN:
17709   case ISD::UMIN:
17710   case ISD::SMAX:
17711   case ISD::UMAX:
17712     return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
17713   case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
17714   case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
17715   case ISD::LOAD:       return PerformLOADCombine(N, DCI);
17716   case ARMISD::VLD1DUP:
17717   case ARMISD::VLD2DUP:
17718   case ARMISD::VLD3DUP:
17719   case ARMISD::VLD4DUP:
17720     return PerformVLDCombine(N, DCI);
17723   case ISD::BITCAST:
17724     return PerformBITCASTCombine(N, DCI, Subtarget);
// NOTE(review): case labels (lines 17725-17727) are missing here; presumably
// ARMISD::VECTOR_REG_CAST and related opcodes.
17728     return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget);
17729   case ARMISD::MVETRUNC:
17730     return PerformMVETruncCombine(N, DCI);
17731   case ARMISD::MVESEXT:
17732   case ARMISD::MVEZEXT:
17733     return PerformMVEExtCombine(N, DCI);
17734   case ARMISD::VCMP:
17735     return PerformVCMPCombine(N, DCI.DAG, Subtarget);
17736   case ISD::VECREDUCE_ADD:
17737     return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
17738   case ARMISD::VMOVN:
17739     return PerformVMOVNCombine(N, DCI);
17740   case ARMISD::VQMOVNs:
17741   case ARMISD::VQMOVNu:
17742     return PerformVQMOVNCombine(N, DCI);
17743   case ARMISD::ASRL:
17744   case ARMISD::LSRL:
17745   case ARMISD::LSLL:
17746     return PerformLongShiftCombine(N, DCI.DAG);
// The SMULW/SMLAL cases below narrow the demanded bits of the widening
// multiply operands; the lines constructing the APInt masks were lost in
// extraction.
17747   case ARMISD::SMULWB: {
17748     unsigned BitWidth = N->getValueType(0).getSizeInBits();
17750     if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
17751       return SDValue();
17752     break;
17753   }
17754   case ARMISD::SMULWT: {
17755     unsigned BitWidth = N->getValueType(0).getSizeInBits();
17757     if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
17758       return SDValue();
17759     break;
17760   }
17761   case ARMISD::SMLALBB:
17762   case ARMISD::QADD16b:
17763   case ARMISD::QSUB16b:
17764   case ARMISD::UQADD16b:
17765   case ARMISD::UQSUB16b: {
17766     unsigned BitWidth = N->getValueType(0).getSizeInBits();
17768     if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
17769         (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
17770       return SDValue();
17771     break;
17772   }
17773   case ARMISD::SMLALBT: {
17774     unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
17776     unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
17778     if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
17779         (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
17780       return SDValue();
17781     break;
17782   }
17783   case ARMISD::SMLALTB: {
17784     unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
17786     unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
17788     if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
17789         (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
17790       return SDValue();
17791     break;
17792   }
17793   case ARMISD::SMLALTT: {
17794     unsigned BitWidth = N->getValueType(0).getSizeInBits();
17796     if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
17797         (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
17798       return SDValue();
17799     break;
17800   }
17801   case ARMISD::QADD8b:
17802   case ARMISD::QSUB8b:
17803   case ARMISD::UQADD8b:
17804   case ARMISD::UQSUB8b: {
17805     unsigned BitWidth = N->getValueType(0).getSizeInBits();
17807     if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
17808         (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
17809       return SDValue();
17810     break;
17811   }
// NOTE(review): the case labels opening this intrinsic dispatch (lines
// 17812-17813, presumably ISD::INTRINSIC_VOID / INTRINSIC_W_CHAIN) are
// missing from this excerpt. Operand 1 holds the intrinsic ID.
17814     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
17815     case Intrinsic::arm_neon_vld1:
17816     case Intrinsic::arm_neon_vld1x2:
17817     case Intrinsic::arm_neon_vld1x3:
17818     case Intrinsic::arm_neon_vld1x4:
17819     case Intrinsic::arm_neon_vld2:
17820     case Intrinsic::arm_neon_vld3:
17821     case Intrinsic::arm_neon_vld4:
17822     case Intrinsic::arm_neon_vld2lane:
17823     case Intrinsic::arm_neon_vld3lane:
17824     case Intrinsic::arm_neon_vld4lane:
17825     case Intrinsic::arm_neon_vld2dup:
17826     case Intrinsic::arm_neon_vld3dup:
17827     case Intrinsic::arm_neon_vld4dup:
17828     case Intrinsic::arm_neon_vst1:
17829     case Intrinsic::arm_neon_vst1x2:
17830     case Intrinsic::arm_neon_vst1x3:
17831     case Intrinsic::arm_neon_vst1x4:
17832     case Intrinsic::arm_neon_vst2:
17833     case Intrinsic::arm_neon_vst3:
17834     case Intrinsic::arm_neon_vst4:
17835     case Intrinsic::arm_neon_vst2lane:
17836     case Intrinsic::arm_neon_vst3lane:
17837     case Intrinsic::arm_neon_vst4lane:
17838       return PerformVLDCombine(N, DCI);
17839     case Intrinsic::arm_mve_vld2q:
17840     case Intrinsic::arm_mve_vld4q:
17841     case Intrinsic::arm_mve_vst2q:
17842     case Intrinsic::arm_mve_vst4q:
17843       return PerformMVEVLDCombine(N, DCI);
17844     default: break;
17845     }
17846     break;
17847   }
17848   return SDValue();
17849 }
17850
// NOTE(review): the opening of this definition is missing from this excerpt;
// presumably ARMTargetLowering::isDesirableToTransformToIntegerOp — only f32
// loads/stores are transformed to integer operations.
17852                                                        EVT VT) const {
17853   return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
17854 }
17855
// NOTE(review): the first signature line of this definition is missing from
// this excerpt; presumably ARMTargetLowering::allowsMisalignedMemoryAccesses.
// Reports whether an unaligned access of type VT is legal on this subtarget,
// and optionally (via *Fast) whether it is also fast.
17857                                                        Align Alignment,
17859                                                        bool *Fast) const {
17860   // Depends what it gets converted into if the type is weird.
17861   if (!VT.isSimple())
17862     return false;
17863
17864   // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
17865   bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
17866   auto Ty = VT.getSimpleVT().SimpleTy;
17867
17868   if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
17869     // Unaligned access can use (for example) LRDB, LRDH, LDR
17870     if (AllowsUnaligned) {
17871       if (Fast)
// Unaligned scalar accesses are only considered fast from ARMv7 onwards.
17872         *Fast = Subtarget->hasV7Ops();
17873       return true;
17874     }
17875   }
17876
17877   if (Ty == MVT::f64 || Ty == MVT::v2f64) {
17878     // For any little-endian targets with neon, we can support unaligned ld/st
17879     // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
17880     // A big-endian target may also explicitly support unaligned accesses
17881     if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
17882       if (Fast)
17883         *Fast = true;
17884       return true;
17885     }
17886   }
17887
// Everything below is specific to MVE.
17888   if (!Subtarget->hasMVEIntegerOps())
17889     return false;
17890
17891   // These are for predicates
17892   if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) {
17893     if (Fast)
17894       *Fast = true;
17895     return true;
17896   }
17897
17898   // These are for truncated stores/narrowing loads. They are fine so long as
17899   // the alignment is at least the size of the item being loaded
17900   if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
17901       Alignment >= VT.getScalarSizeInBits() / 8) {
17902     if (Fast)
17903       *Fast = true;
17904     return true;
17905   }
17906
17907   // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
17908   // VSTRW.U32 all store the vector register in exactly the same format, and
17909   // differ only in the range of their immediate offset field and the required
17910   // alignment. So there is always a store that can be used, regardless of
17911   // actual type.
17912   //
17913   // For big endian, that is not the case. But can still emit a (VSTRB.U8;
17914   // VREV64.8) pair and get the same effect. This will likely be better than
17915   // aligning the vector through the stack.
17916   if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
17917       Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
17918       Ty == MVT::v2f64) {
17919     if (Fast)
17920       *Fast = true;
17921     return true;
17922   }
17923
17924   return false;
17925 }
17926
17927
// NOTE(review): the first signature line is missing from this excerpt;
// presumably EVT ARMTargetLowering::getOptimalMemOpType. Chooses a wide type
// for memcpy/memset expansion when NEON is usable; the elided lines
// (17936-17937, 17942-17943) presumably contain allowsMisalignedMemoryAccesses
// calls feeding 'Fast' — TODO confirm against the full file.
17929     const MemOp &Op, const AttributeList &FuncAttributes) const {
17930   // See if we can use NEON instructions for this...
17931   if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
17932       !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
17933     bool Fast;
17934     if (Op.size() >= 16 &&
17935         (Op.isAligned(Align(16)) ||
17938                                          Fast))) {
17939       return MVT::v2f64;
17940     } else if (Op.size() >= 8 &&
17941                (Op.isAligned(Align(8)) ||
17944                                                Fast))) {
17945       return MVT::f64;
17946     }
17947   }
17948
17949   // Let the target-independent logic figure it out.
17950   return MVT::Other;
17951 }
17952
17953// 64-bit integers are split into their high and low parts and held in two
17954// different registers, so the trunc is free since the low register can just
17955// be used.
// NOTE(review): the signature line is missing from this excerpt; presumably
// bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const.
17957   if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
17958     return false;
17959   unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
17960   unsigned DestBits = DstTy->getPrimitiveSizeInBits();
// Only the i64 -> i32 truncation is free (take the low register).
17961   return (SrcBits == 64 && DestBits == 32);
17962 }
17963
// NOTE(review): signature missing from this excerpt; presumably the EVT
// overload bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const.
// Mirrors the Type* overload: only scalar-integer i64 -> i32 is free.
17965   if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
17966       !DstVT.isInteger())
17967     return false;
17968   unsigned SrcBits = SrcVT.getSizeInBits();
17969   unsigned DestBits = DstVT.getSizeInBits();
17970   return (SrcBits == 64 && DestBits == 32);
17971 }
17972
// NOTE(review): signature missing from this excerpt; presumably
// bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const.
// A zext of a narrow load is free because the load instruction already
// zero-extends into the 32-bit register.
17974   if (Val.getOpcode() != ISD::LOAD)
17975     return false;
17976
17977   EVT VT1 = Val.getValueType();
17978   if (!VT1.isSimple() || !VT1.isInteger() ||
17979       !VT2.isSimple() || !VT2.isInteger())
17980     return false;
17981
17982   switch (VT1.getSimpleVT().SimpleTy) {
17983   default: break;
17984   case MVT::i1:
17985   case MVT::i8:
17986   case MVT::i16:
17987     // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
17988     return true;
17989   }
17990
17991   return false;
17992 }
17993
// NOTE(review): signature missing from this excerpt; presumably
// bool ARMTargetLowering::isFNegFree(EVT VT) const.
17995   if (!VT.isSimple())
17996     return false;
17997
17998   // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
17999   // negate values directly (fneg is free). So, we don't want to let the DAG
18000   // combiner rewrite fneg into xors and some other instructions.  For f16 and
18001   // FullFP16 argument passing, some bitcast nodes may be introduced,
18002   // triggering this DAG combine rewrite, so we are avoiding that with this.
18003   switch (VT.getSimpleVT().SimpleTy) {
18004   default: break;
18005   case MVT::f16:
18006     return Subtarget->hasFullFP16();
18007   }
18008
18009   return false;
18010 }
18011
18012/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
18013/// of the vector elements.
// NOTE(review): the signature line is missing from this excerpt; presumably
// static bool areExtractExts(Value *Ext1, Value *Ext2) — see the call site in
// shouldSinkOperands below. The conditions on lines 18021-18023 (presumably
// the matching checks for Ext2 and the areExtDoubled calls) are also elided.
18015   auto areExtDoubled = [](Instruction *Ext) {
18016     return Ext->getType()->getScalarSizeInBits() ==
18017            2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
18018   };
18019
18020   if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
18024     return false;
18025
18026   return true;
18027 }
18028
18029/// Check if sinking \p I's operands to I's basic block is profitable, because
18030/// the operands can be folded into a target instruction, e.g.
18031/// sext/zext can be folded into vsubl.
// NOTE(review): the first signature line is missing from this excerpt;
// presumably bool ARMTargetLowering::shouldSinkOperands(Instruction *I, ...).
18033                                            SmallVectorImpl<Use *> &Ops) const {
18034   if (!I->getType()->isVectorTy())
18035     return false;
18036
// NEON: sink matching extends feeding an add/sub so they fold into
// vaddl/vsubl-style instructions.
18037   if (Subtarget->hasNEON()) {
18038     switch (I->getOpcode()) {
18039     case Instruction::Sub:
18040     case Instruction::Add: {
18041       if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
18042         return false;
18043       Ops.push_back(&I->getOperandUse(0));
18044       Ops.push_back(&I->getOperandUse(1));
18045       return true;
18046     }
18047     default:
18048       return false;
18049     }
18050   }
18051
18052   if (!Subtarget->hasMVEIntegerOps())
18053     return false;
18054
// True if this fmul's single use is the subtrahend of an fsub (an FMS shape).
18055   auto IsFMSMul = [&](Instruction *I) {
18056     if (!I->hasOneUse())
18057       return false;
18058     auto *Sub = cast<Instruction>(*I->users().begin());
18059     return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
18060   };
// True if either operand of this fma call is an fneg (again an FMS shape).
18061   auto IsFMS = [&](Instruction *I) {
18062     if (match(I->getOperand(0), m_FNeg(m_Value())) ||
18063         match(I->getOperand(1), m_FNeg(m_Value())))
18064       return true;
18065     return false;
18066   };
18067
// Whether operand index 'Operand' of instruction I can absorb a sunk splat.
18068   auto IsSinker = [&](Instruction *I, int Operand) {
18069     switch (I->getOpcode()) {
18070     case Instruction::Add:
18071     case Instruction::Mul:
18072     case Instruction::FAdd:
18073     case Instruction::ICmp:
18074     case Instruction::FCmp:
18075       return true;
18076     case Instruction::FMul:
18077       return !IsFMSMul(I);
18078     case Instruction::Sub:
18079     case Instruction::FSub:
18080     case Instruction::Shl:
18081     case Instruction::LShr:
18082     case Instruction::AShr:
// Non-commutative: only the second operand can take the splat.
18083       return Operand == 1;
18084     case Instruction::Call:
18085       if (auto *II = dyn_cast<IntrinsicInst>(I)) {
18086         switch (II->getIntrinsicID()) {
18087         case Intrinsic::fma:
18088           return !IsFMS(I);
18089         case Intrinsic::arm_mve_add_predicated:
18090         case Intrinsic::arm_mve_mul_predicated:
18091         case Intrinsic::arm_mve_qadd_predicated:
18092         case Intrinsic::arm_mve_hadd_predicated:
18093         case Intrinsic::arm_mve_vqdmull_predicated:
18094         case Intrinsic::arm_mve_qdmulh_predicated:
18095         case Intrinsic::arm_mve_qrdmulh_predicated:
18096         case Intrinsic::arm_mve_fma_predicated:
18097           return true;
18098         case Intrinsic::arm_mve_sub_predicated:
18099         case Intrinsic::arm_mve_qsub_predicated:
18100         case Intrinsic::arm_mve_hsub_predicated:
18101           return Operand == 1;
18102         default:
18103           return false;
18104         }
18105       }
18106       return false;
18107     default:
18108       return false;
18109     }
18110   };
18111
18112   for (auto OpIdx : enumerate(I->operands())) {
18113     Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
18114     // Make sure we are not already sinking this operand
18115     if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
18116       continue;
18117
18118     Instruction *Shuffle = Op;
18119     if (Shuffle->getOpcode() == Instruction::BitCast)
18120       Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
18121     // We are looking for a splat that can be sunk.
// NOTE(review): one line of this match expression (18124, presumably the
// m_InsertElt pattern for the splatted scalar) is missing from this excerpt.
18122     if (!Shuffle ||
18123         !match(Shuffle, m_Shuffle(
18125                             m_Undef(), m_ZeroMask())))
18126       continue;
18127     if (!IsSinker(I, OpIdx.index()))
18128       continue;
18129
18130     // All uses of the shuffle should be sunk to avoid duplicating it across gpr
18131     // and vector registers
18132     for (Use &U : Op->uses()) {
18133       Instruction *Insn = cast<Instruction>(U.getUser());
18134       if (!IsSinker(Insn, U.getOperandNo()))
18135         return false;
18136     }
18137
18138     Ops.push_back(&Shuffle->getOperandUse(0));
18139     if (Shuffle != Op)
18140       Ops.push_back(&Op->getOperandUse(0));
18141     Ops.push_back(&OpIdx.value());
18142   }
18143   return true;
18144 }
18145
// NOTE(review): the signature line is missing from this excerpt; presumably
// Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI).
// For MVE, fp splats are better materialized as same-width integer splats;
// returns the replacement scalar type or nullptr for "leave unchanged".
18147   if (!Subtarget->hasMVEIntegerOps())
18148     return nullptr;
18149   Type *SVIType = SVI->getType();
18150   Type *ScalarType = SVIType->getScalarType();
18151
18152   if (ScalarType->isFloatTy())
18153     return Type::getInt32Ty(SVIType->getContext());
18154   if (ScalarType->isHalfTy())
18155     return Type::getInt16Ty(SVIType->getContext());
18156   return nullptr;
18157 }
18158
// NOTE(review): the signature line is missing from this excerpt; presumably
// bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const.
18160   EVT VT = ExtVal.getValueType();
18161
18162   if (!isTypeLegal(VT))
18163     return false;
18164
// Expanding masked loads cannot be turned into extending loads.
18165   if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
18166     if (Ld->isExpandingLoad())
18167       return false;
18168   }
18169
18170   if (Subtarget->hasMVEIntegerOps())
18171     return true;
18172
18173   // Don't create a loadext if we can fold the extension into a wide/long
18174   // instruction.
18175   // If there's more than one user instruction, the loadext is desirable no
18176   // matter what.  There can be two uses by the same instruction.
18177   if (ExtVal->use_empty() ||
18178       !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
18179     return true;
18180
18181   SDNode *U = *ExtVal->use_begin();
18182   if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
18183        U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
18184     return false;
18185
18186   return true;
18187 }
18188
// NOTE(review): the signature line is missing from this excerpt; presumably
// bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2).
// A condition line (18193) is also elided here — TODO confirm against the
// full file.
18190   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
18191     return false;
18192
18194     return false;
18195
18196   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
18197
18198   // Assuming the caller doesn't have a zeroext or signext return parameter,
18199   // truncation all the way down to i1 is valid.
18200   return true;
18201 }
18202
// NOTE(review): the first signature line is missing from this excerpt;
// presumably InstructionCost/int ARMTargetLowering::getScalingFactorCost.
// Returns 0 (free) for legal addressing modes, -1 for illegal ones; on FPAO
// cores a negative scale costs 1 because positive offsets execute faster.
18204                                                 const AddrMode &AM,
18205                                                 Type *Ty,
18206                                                 unsigned AS) const {
18207   if (isLegalAddressingMode(DL, AM, Ty, AS)) {
18208     if (Subtarget->hasFPAO())
18209       return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
18210     return 0;
18211   }
18212   return -1;
18213 }
18214
18215/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
18216/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
18217/// expanded to FMAs when this method returns true, otherwise fmuladd is
18218/// expanded to fmul + fadd.
18219///
18220/// ARM supports both fused and unfused multiply-add operations; we already
18221/// lower a pair of fmul and fadd to the latter so it's not clear that there
18222/// would be a gain or that the gain would be worthwhile enough to risk
18223/// correctness bugs.
18224///
18225/// For MVE, we set this to true as it helps simplify the need for some
18226/// patterns (and we don't have the non-fused floating point instruction).
18227bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
18228 EVT VT) const {
18229 if (!VT.isSimple())
18230 return false;
18231
18232 switch (VT.getSimpleVT().SimpleTy) {
18233 case MVT::v4f32:
18234 case MVT::v8f16:
18235 return Subtarget->hasMVEFloatOps();
18236 case MVT::f16:
18237 return Subtarget->useFPVFMx16();
18238 case MVT::f32:
18239 return Subtarget->useFPVFMx();
18240 case MVT::f64:
18241 return Subtarget->useFPVFMx64();
18242 default:
18243 break;
18244 }
18245
18246 return false;
18247}
18248
18249static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
18250 if (V < 0)
18251 return false;
18252
18253 unsigned Scale = 1;
18254 switch (VT.getSimpleVT().SimpleTy) {
18255 case MVT::i1:
18256 case MVT::i8:
18257 // Scale == 1;
18258 break;
18259 case MVT::i16:
18260 // Scale == 2;
18261 Scale = 2;
18262 break;
18263 default:
18264 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
18265 // Scale == 4;
18266 Scale = 4;
18267 break;
18268 }
18269
18270 if ((V & (Scale - 1)) != 0)
18271 return false;
18272 return isUInt<5>(V / Scale);
18273}
18274
18275static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
18276 const ARMSubtarget *Subtarget) {
18277 if (!VT.isInteger() && !VT.isFloatingPoint())
18278 return false;
18279 if (VT.isVector() && Subtarget->hasNEON())
18280 return false;
18281 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
18282 !Subtarget->hasMVEFloatOps())
18283 return false;
18284
18285 bool IsNeg = false;
18286 if (V < 0) {
18287 IsNeg = true;
18288 V = -V;
18289 }
18290
18291 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);
18292
18293 // MVE: size * imm7
18294 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
18295 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
18296 case MVT::i32:
18297 case MVT::f32:
18298 return isShiftedUInt<7,2>(V);
18299 case MVT::i16:
18300 case MVT::f16:
18301 return isShiftedUInt<7,1>(V);
18302 case MVT::i8:
18303 return isUInt<7>(V);
18304 default:
18305 return false;
18306 }
18307 }
18308
18309 // half VLDR: 2 * imm8
18310 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
18311 return isShiftedUInt<8, 1>(V);
18312 // VLDR and LDRD: 4 * imm8
18313 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
18314 return isShiftedUInt<8, 2>(V);
18315
18316 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
18317 // + imm12 or - imm8
18318 if (IsNeg)
18319 return isUInt<8>(V);
18320 return isUInt<12>(V);
18321 }
18322
18323 return false;
18324}
18325
18326/// isLegalAddressImmediate - Return true if the integer value can be used
18327/// as the offset of the target addressing mode for load / store of the
18328/// given type.
18329static bool isLegalAddressImmediate(int64_t V, EVT VT,
18330 const ARMSubtarget *Subtarget) {
18331 if (V == 0)
18332 return true;
18333
18334 if (!VT.isSimple())
18335 return false;
18336
18337 if (Subtarget->isThumb1Only())
18338 return isLegalT1AddressImmediate(V, VT);
18339 else if (Subtarget->isThumb2())
18340 return isLegalT2AddressImmediate(V, VT, Subtarget);
18341
18342 // ARM mode.
18343 if (V < 0)
18344 V = - V;
18345 switch (VT.getSimpleVT().SimpleTy) {
18346 default: return false;
18347 case MVT::i1:
18348 case MVT::i8:
18349 case MVT::i32:
18350 // +- imm12
18351 return isUInt<12>(V);
18352 case MVT::i16:
18353 // +- imm8
18354 return isUInt<8>(V);
18355 case MVT::f32:
18356 case MVT::f64:
18357 if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
18358 return false;
18359 return isShiftedUInt<8, 2>(V);
18360 }
18361}
18362
// NOTE(review): the first signature line is missing from this excerpt;
// presumably bool ARMTargetLowering::isLegalT2ScaledAddressingMode(
//     const AddrMode &AM, EVT VT) const.
18364                                                       EVT VT) const {
18365   int Scale = AM.Scale;
18366   if (Scale < 0)
18367     return false;
18368
18369   switch (VT.getSimpleVT().SimpleTy) {
18370   default: return false;
18371   case MVT::i1:
18372   case MVT::i8:
18373   case MVT::i16:
18374   case MVT::i32:
18375     if (Scale == 1)
18376       return true;
18377     // r + r << imm
18378     Scale = Scale & ~1;
18379     return Scale == 2 || Scale == 4 || Scale == 8;
18380   case MVT::i64:
18381     // FIXME: What are we trying to model here? ldrd doesn't have an r + r
18382     // version in Thumb mode.
18383     // r + r
18384     if (Scale == 1)
18385       return true;
18386     // r * 2 (this can be lowered to r + r).
18387     if (!AM.HasBaseReg && Scale == 2)
18388       return true;
18389     return false;
18390   case MVT::isVoid:
18391     // Note, we allow "void" uses (basically, uses that aren't loads or
18392     // stores), because arm allows folding a scale into many arithmetic
18393     // operations. This should be made more precise and revisited later.
18394
18395     // Allow r << imm, but the imm has to be a multiple of two.
18396     if (Scale & 1) return false;
18397     return isPowerOf2_32(Scale);
18398   }
18399 }
18400
// NOTE(review): the first signature line is missing from this excerpt;
// presumably bool ARMTargetLowering::isLegalT1ScaledAddressingMode(
//     const AddrMode &AM, EVT VT) const.
18402                                                       EVT VT) const {
18403   const int Scale = AM.Scale;
18404
18405   // Negative scales are not supported in Thumb1.
18406   if (Scale < 0)
18407     return false;
18408
18409   // Thumb1 addressing modes do not support register scaling excepting the
18410   // following cases:
18411   // 1. Scale == 1 means no scaling.
18412   // 2. Scale == 2 this can be lowered to r + r if there is no base register.
18413   return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
18414 }
18415
18416/// isLegalAddressingMode - Return true if the addressing mode represented
18417/// by AM is legal for this target, for a load/store of the specified type.
// NOTE(review): the first signature line is missing from this excerpt;
// presumably bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
//     const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I) const.
18418                                               const AddrMode &AM, Type *Ty,
18419                                               unsigned AS, Instruction *I) const {
18420   EVT VT = getValueType(DL, Ty, true);
18421   if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
18422     return false;
18423
18424   // Can never fold addr of global into load/store.
18425   if (AM.BaseGV)
18426     return false;
18427
18428   switch (AM.Scale) {
18429   case 0:  // no scale reg, must be "r+i" or "r", or "i".
18430     break;
18431   default:
18432     // ARM doesn't support any R+R*scale+imm addr modes.
18433     if (AM.BaseOffs)
18434       return false;
18435
18436     if (!VT.isSimple())
18437       return false;
18438
18439     if (Subtarget->isThumb1Only())
18440       return isLegalT1ScaledAddressingMode(AM, VT);
18441
18442     if (Subtarget->isThumb2())
18443       return isLegalT2ScaledAddressingMode(AM, VT);
18444
// ARM-mode scaled addressing below.
18445     int Scale = AM.Scale;
18446     switch (VT.getSimpleVT().SimpleTy) {
18447     default: return false;
18448     case MVT::i1:
18449     case MVT::i8:
18450     case MVT::i32:
18451       if (Scale < 0) Scale = -Scale;
18452       if (Scale == 1)
18453         return true;
18454       // r + r << imm
18455       return isPowerOf2_32(Scale & ~1);
18456     case MVT::i16:
18457     case MVT::i64:
18458       // r +/- r
18459       if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
18460         return true;
18461       // r * 2 (this can be lowered to r + r).
18462       if (!AM.HasBaseReg && Scale == 2)
18463         return true;
18464       return false;
18465
18466     case MVT::isVoid:
18467       // Note, we allow "void" uses (basically, uses that aren't loads or
18468       // stores), because arm allows folding a scale into many arithmetic
18469       // operations. This should be made more precise and revisited later.
18470
18471       // Allow r << imm, but the imm has to be a multiple of two.
18472       if (Scale & 1) return false;
18473       return isPowerOf2_32(Scale);
18474     }
18475   }
18476   return true;
18477 }
18479
18480/// isLegalICmpImmediate - Return true if the specified immediate is legal
18481/// icmp immediate, that is the target has icmp instructions which can compare
18482/// a register against the immediate without having to materialize the
18483/// immediate into a register.
// NOTE(review): the signature line is missing from this excerpt; presumably
// bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const.
18485   // Thumb2 and ARM modes can use cmn for negative immediates.
18486   if (!Subtarget->isThumb())
18487     return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
18488            ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
18489   if (Subtarget->isThumb2())
18490     return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
18491            ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
18492   // Thumb1 doesn't have cmn, and only 8-bit immediates.
18493   return Imm >= 0 && Imm <= 255;
18494 }
18495
18496/// isLegalAddImmediate - Return true if the specified immediate is a legal add
18497/// *or sub* immediate, that is the target has add or sub instructions which can
18498/// add a register with the immediate without having to materialize the
18499/// immediate into a register.
// NOTE(review): the signature line is missing from this excerpt; presumably
// bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const.
18501   // Same encoding for add/sub, just flip the sign.
18502   int64_t AbsImm = std::abs(Imm);
18503   if (!Subtarget->isThumb())
18504     return ARM_AM::getSOImmVal(AbsImm) != -1;
18505   if (Subtarget->isThumb2())
18506     return ARM_AM::getT2SOImmVal(AbsImm) != -1;
18507   // Thumb1 only has 8-bit unsigned immediate.
18508   return AbsImm >= 0 && AbsImm <= 255;
18510
// NOTE(review): the first signature line is missing from this excerpt;
// presumably static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, ...).
// Decomposes an ADD/SUB pointer into Base + Offset for ARM-mode pre/post
// indexed load/store formation. The elided lines (18521, 18535) presumably
// bind 'RHS' via dyn_cast<ConstantSDNode>(Ptr->getOperand(1)) — TODO confirm.
18512                                       bool isSEXTLoad, SDValue &Base,
18513                                       SDValue &Offset, bool &isInc,
18514                                       SelectionDAG &DAG) {
18515   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
18516     return false;
18517
18518   if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
18519     // AddressingMode 3
18520     Base = Ptr->getOperand(0);
18522       int RHSC = (int)RHS->getZExtValue();
18523       if (RHSC < 0 && RHSC > -256) {
18524         assert(Ptr->getOpcode() == ISD::ADD);
18525         isInc = false;
18526         Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
18527         return true;
18528       }
18529     }
18530     isInc = (Ptr->getOpcode() == ISD::ADD);
18531     Offset = Ptr->getOperand(1);
18532     return true;
18533   } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
18534     // AddressingMode 2
18536       int RHSC = (int)RHS->getZExtValue();
18537       if (RHSC < 0 && RHSC > -0x1000) {
18538         assert(Ptr->getOpcode() == ISD::ADD);
18539         isInc = false;
18540         Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
18541         Base = Ptr->getOperand(0);
18542         return true;
18543       }
18544     }
18545
18546     if (Ptr->getOpcode() == ISD::ADD) {
18547       isInc = true;
// NOTE(review): the lines computing ShOpcVal (18548-18549, presumably via
// ARM_AM::getShiftOpcForNode on the operands) are missing from this excerpt.
18550       if (ShOpcVal != ARM_AM::no_shift) {
18551         Base = Ptr->getOperand(1);
18552         Offset = Ptr->getOperand(0);
18553       } else {
18554         Base = Ptr->getOperand(0);
18555         Offset = Ptr->getOperand(1);
18556       }
18557       return true;
18558     }
18559
18560     isInc = (Ptr->getOpcode() == ISD::ADD);
18561     Base = Ptr->getOperand(0);
18562     Offset = Ptr->getOperand(1);
18563     return true;
18564   }
18565
18566   // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
18567   return false;
18568 }
18569
// NOTE(review): the first signature line is missing from this excerpt;
// presumably static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, ...).
// Thumb2 variant: offsets are a signed 8-bit immediate. The elided line
// (18578) presumably binds 'RHS' via dyn_cast<ConstantSDNode> — TODO confirm.
18571                                      bool isSEXTLoad, SDValue &Base,
18572                                      SDValue &Offset, bool &isInc,
18573                                      SelectionDAG &DAG) {
18574   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
18575     return false;
18576
18577   Base = Ptr->getOperand(0);
18579     int RHSC = (int)RHS->getZExtValue();
18580     if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
18581       assert(Ptr->getOpcode() == ISD::ADD);
18582       isInc = false;
18583       Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
18584       return true;
18585     } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
18586       isInc = Ptr->getOpcode() == ISD::ADD;
18587       Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
18588       return true;
18589     }
18590   }
18591
18592   return false;
18593 }
18594
// NOTE(review): one parameter line (18597, presumably "SDValue &Base,
// SDValue &Offset,") and the line binding 'RHS' (18609) are missing from
// this excerpt — TODO confirm against the full file.
18595 static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
18596                                       bool isSEXTLoad, bool IsMasked, bool isLE,
18598                                       bool &isInc, SelectionDAG &DAG) {
18599   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
18600     return false;
18601   if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
18602     return false;
18603
18604   // We allow LE non-masked loads to change the type (for example use a vldrb.8
18605   // as opposed to a vldrw.32). This can allow extra addressing modes or
18606   // alignments for what is otherwise an equivalent instruction.
18607   bool CanChangeType = isLE && !IsMasked;
18608
18610   int RHSC = (int)RHS->getZExtValue();
18611
// Checks whether RHSC fits an imm7 offset scaled by 'Scale', setting
// isInc/Offset accordingly (negative offsets only valid on an ADD).
18612   auto IsInRange = [&](int RHSC, int Limit, int Scale) {
18613     if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
18614       assert(Ptr->getOpcode() == ISD::ADD);
18615       isInc = false;
18616       Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
18617       return true;
18618     } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
18619       isInc = Ptr->getOpcode() == ISD::ADD;
18620       Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
18621       return true;
18622     }
18623     return false;
18624   };
18625
18626   // Try to find a matching instruction based on s/zext, Alignment, Offset and
18627   // (in BE/masked) type.
18628   Base = Ptr->getOperand(0);
18629   if (VT == MVT::v4i16) {
18630     if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
18631       return true;
18632   } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
18633     if (IsInRange(RHSC, 0x80, 1))
18634       return true;
18635   } else if (Alignment >= 4 &&
18636              (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
18637              IsInRange(RHSC, 0x80, 4))
18638     return true;
18639   else if (Alignment >= 2 &&
18640            (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
18641            IsInRange(RHSC, 0x80, 2))
18642     return true;
18643   else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
18644     return true;
18645   return false;
18646 }
18647
18648/// getPreIndexedAddressParts - returns true by value, base pointer and
18649/// offset pointer and addressing mode by reference if the node's address
18650/// can be legally represented as pre-indexed load / store address.
18651bool
18653                                             SDValue &Offset,
18655                                             SelectionDAG &DAG) const {
  // Thumb-1 has no pre-indexed forms at all.
18656   if (Subtarget->isThumb1Only())
18657     return false;
18658 
  // Gather the address operand and memory properties from whichever of the
  // four memory-node kinds (plain/masked load/store) N turns out to be.
18659   EVT VT;
18660   SDValue Ptr;
18661   Align Alignment;
18662   bool isSEXTLoad = false;
18663   bool IsMasked = false;
18664   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
18665     Ptr = LD->getBasePtr();
18666     VT = LD->getMemoryVT();
18667     Alignment = LD->getAlign();
18668     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
18669   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
18670     Ptr = ST->getBasePtr();
18671     VT = ST->getMemoryVT();
18672     Alignment = ST->getAlign();
18673   } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
18674     Ptr = LD->getBasePtr();
18675     VT = LD->getMemoryVT();
18676     Alignment = LD->getAlign();
18677     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
18678     IsMasked = true;
  // (Masked-store branch: its dyn_cast line was lost in extraction.)
18680     Ptr = ST->getBasePtr();
18681     VT = ST->getMemoryVT();
18682     Alignment = ST->getAlign();
18683     IsMasked = true;
18684   } else
18685     return false;
18686 
  // Dispatch to the matcher for the instruction set actually in use:
  // MVE for vectors, Thumb-2 or classic ARM for scalars.
18687   bool isInc;
18688   bool isLegal = false;
18689   if (VT.isVector())
18690     isLegal = Subtarget->hasMVEIntegerOps() &&
18692                   Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
18693                   Subtarget->isLittle(), Base, Offset, isInc, DAG);
18694   else {
18695     if (Subtarget->isThumb2())
18696       isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
18697                                          Offset, isInc, DAG);
18698     else
18699       isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
18700                                           Offset, isInc, DAG);
18701   }
18702   if (!isLegal)
18703     return false;
18704 
18706   return true;
18707}
18708
18709/// getPostIndexedAddressParts - returns true by value, base pointer and
18710/// offset pointer and addressing mode by reference if this node can be
18711/// combined with a load / store to form a post-indexed load / store.
18713                                                   SDValue &Base,
18714                                                   SDValue &Offset,
18716                                                   SelectionDAG &DAG) const {
  // Collect address/type/alignment info from whichever memory node kind N is.
  // isNonExt records that a load is non-extending / a store non-truncating,
  // which the Thumb-1 path below requires.
18717   EVT VT;
18718   SDValue Ptr;
18719   Align Alignment;
18720   bool isSEXTLoad = false, isNonExt;
18721   bool IsMasked = false;
18722   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
18723     VT = LD->getMemoryVT();
18724     Ptr = LD->getBasePtr();
18725     Alignment = LD->getAlign();
18726     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
18727     isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
18728   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
18729     VT = ST->getMemoryVT();
18730     Ptr = ST->getBasePtr();
18731     Alignment = ST->getAlign();
18732     isNonExt = !ST->isTruncatingStore();
18733   } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
18734     VT = LD->getMemoryVT();
18735     Ptr = LD->getBasePtr();
18736     Alignment = LD->getAlign();
18737     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
18738     isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
18739     IsMasked = true;
  // (Masked-store branch: its dyn_cast line was lost in extraction.)
18741     VT = ST->getMemoryVT();
18742     Ptr = ST->getBasePtr();
18743     Alignment = ST->getAlign();
18744     isNonExt = !ST->isTruncatingStore();
18745     IsMasked = true;
18746   } else
18747     return false;
18748 
18749   if (Subtarget->isThumb1Only()) {
18750     // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
18751     // must be non-extending/truncating, i32, with an offset of 4.
18752     assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
18753     if (Op->getOpcode() != ISD::ADD || !isNonExt)
18754       return false;
18755     auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
18756     if (!RHS || RHS->getZExtValue() != 4)
18757       return false;
18758     if (Alignment < Align(4))
18759       return false;
18760 
18761     Offset = Op->getOperand(1);
18762     Base = Op->getOperand(0);
18763     AM = ISD::POST_INC;
18764     return true;
18765   }
18766 
  // Non-Thumb-1: dispatch to the matcher for the active instruction set.
18767   bool isInc;
18768   bool isLegal = false;
18769   if (VT.isVector())
18770     isLegal = Subtarget->hasMVEIntegerOps() &&
18771               getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
18772                                         Subtarget->isLittle(), Base, Offset,
18773                                         isInc, DAG);
18774   else {
18775     if (Subtarget->isThumb2())
18776       isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
18777                                          isInc, DAG);
18778     else
18779       isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
18780                                           isInc, DAG);
18781   }
18782   if (!isLegal)
18783     return false;
18784 
18785   if (Ptr != Base) {
18786     // Swap base ptr and offset to catch more post-index load / store when
18787     // it's legal. In Thumb2 mode, offset must be an immediate.
18788     if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
18789         !Subtarget->isThumb2())
  // (The std::swap(Base, Offset) line was lost in extraction.)
18791 
18792     // Post-indexed load / store update the base pointer.
18793     if (Ptr != Base)
18794       return false;
18795   }
18796 
18798   return true;
18799}
18800
// computeKnownBitsForTargetNode - Report which bits of the result of the
// ARM-specific node Op are known to be zero or one, for use by the generic
// known-bits analysis. Unhandled opcodes leave Known fully reset (unknown).
18802                                                      KnownBits &Known,
18803                                                      const APInt &DemandedElts,
18804                                                      const SelectionDAG &DAG,
18805                                                      unsigned Depth) const {
18806   unsigned BitWidth = Known.getBitWidth();
18807   Known.resetAll();
18808   switch (Op.getOpcode()) {
18809   default: break;
18810   case ARMISD::ADDC:
18811   case ARMISD::ADDE:
18812   case ARMISD::SUBC:
18813   case ARMISD::SUBE:
18814     // Special cases when we convert a carry to a boolean.
18815     if (Op.getResNo() == 0) {
18816       SDValue LHS = Op.getOperand(0);
18817       SDValue RHS = Op.getOperand(1);
18818       // (ADDE 0, 0, C) will give us a single bit.
18819       if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
18820           isNullConstant(RHS)) {
18822         return;
18823       }
18824     }
18825     break;
18826   case ARMISD::CMOV: {
18827     // Bits are known zero/one if known on the LHS and RHS.
18828     Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
18829     if (Known.isUnknown())
18830       return;
18831 
18832     KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
    // Keep only the bits agreed on by both arms of the conditional move.
18833     Known = KnownBits::commonBits(Known, KnownRHS);
18834     return;
18835   }
  // Intrinsic case (its `case` label line was lost in extraction): ldrex /
  // ldaex of a narrow memory type zero-fill the bits above the loaded width.
18837     ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
18838     Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
18839     switch (IntID) {
18840     default: return;
18841     case Intrinsic::arm_ldaex:
18842     case Intrinsic::arm_ldrex: {
18843       EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
18844       unsigned MemBits = VT.getScalarSizeInBits();
18846       return;
18847     }
18848     }
18849   }
18850   case ARMISD::BFI: {
18851     // Conservatively, we can recurse down the first operand
18852     // and just mask out all affected bits.
18853     Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
18854 
18855     // The operand to BFI is already a mask suitable for removing the bits it
18856     // sets.
18857     ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
18858     const APInt &Mask = CI->getAPIntValue();
18859     Known.Zero &= Mask;
18860     Known.One &= Mask;
18861     return;
18862   }
18863   case ARMISD::VGETLANEs:
18864   case ARMISD::VGETLANEu: {
18865     const SDValue &SrcSV = Op.getOperand(0);
18866     EVT VecVT = SrcSV.getValueType();
18867     assert(VecVT.isVector() && "VGETLANE expected a vector type");
18868     const unsigned NumSrcElts = VecVT.getVectorNumElements();
18869     ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
18871            "VGETLANE index out of bounds");
18872     unsigned Idx = Pos->getZExtValue();
    // Only the extracted lane's bits matter for the analysis below.
18874     Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
18875 
18876     EVT VT = Op.getValueType();
18877     const unsigned DstSz = VT.getScalarSizeInBits();
18878     const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
18879     (void)SrcSz;
18880     assert(SrcSz == Known.getBitWidth());
18881     assert(DstSz > SrcSz);
    // Widen the lane's known bits to the scalar result width: sign-extend
    // for VGETLANEs, zero-extend for VGETLANEu.
18882     if (Op.getOpcode() == ARMISD::VGETLANEs)
18883       Known = Known.sext(DstSz);
18884     else {
18885       Known = Known.zext(DstSz);
18886     }
18887     assert(DstSz == Known.getBitWidth());
18888     break;
18889   }
18890   case ARMISD::VMOVrh: {
    // VMOVrh moves a half-precision value into a GPR; the top 16 bits of
    // the i32 result are zero.
18891     KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
18892     assert(KnownOp.getBitWidth() == 16);
18893     Known = KnownOp.zext(32);
18894     break;
18895   }
18896   case ARMISD::CSINC:
18897   case ARMISD::CSINV:
18898   case ARMISD::CSNEG: {
18899     KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
18900     KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
18901 
18902     // The result is either:
18903     // CSINC: KnownOp0 or KnownOp1 + 1
18904     // CSINV: KnownOp0 or ~KnownOp1
18905     // CSNEG: KnownOp0 or KnownOp1 * -1
18906     if (Op.getOpcode() == ARMISD::CSINC)
18908           true, false, KnownOp1, KnownBits::makeConstant(APInt(32, 1)));
18909     else if (Op.getOpcode() == ARMISD::CSINV)
18910       std::swap(KnownOp1.Zero, KnownOp1.One);
18911     else if (Op.getOpcode() == ARMISD::CSNEG)
18914 
18916     break;
18917   }
18918   }
18919}
18920
// targetShrinkDemandedConstant - Given (and X, C) where only DemandedBits of
// the result are used, try to replace C with a constant that is cheaper to
// materialise on ARM/Thumb (uxtb/uxth masks, 8-bit immediates, or their
// bitwise inverses). Returns true if a replacement was committed to TLO.
18922     SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
18923     TargetLoweringOpt &TLO) const {
18924   // Delay optimization, so we don't have to deal with illegal types, or block
18925   // optimizations.
18926   if (!TLO.LegalOps)
18927     return false;
18928 
18929   // Only optimize AND for now.
18930   if (Op.getOpcode() != ISD::AND)
18931     return false;
18932 
18933   EVT VT = Op.getValueType();
18934 
18935   // Ignore vectors.
18936   if (VT.isVector())
18937     return false;
18938 
18939   assert(VT == MVT::i32 && "Unexpected integer type");
18940 
18941   // Make sure the RHS really is a constant.
18942   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
18943   if (!C)
18944     return false;
18945 
18946   unsigned Mask = C->getZExtValue();
18947 
  // Any replacement mask must lie between ShrunkMask (mask restricted to the
  // demanded bits) and ExpandedMask (mask with all non-demanded bits set).
18948   unsigned Demanded = DemandedBits.getZExtValue();
18949   unsigned ShrunkMask = Mask & Demanded;
18950   unsigned ExpandedMask = Mask | ~Demanded;
18951 
18952   // If the mask is all zeros, let the target-independent code replace the
18953   // result with zero.
18954   if (ShrunkMask == 0)
18955     return false;
18956 
18957   // If the mask is all ones, erase the AND. (Currently, the target-independent
18958   // code won't do this, so we have to do it explicitly to avoid an infinite
18959   // loop in obscure cases.)
18960   if (ExpandedMask == ~0U)
18961     return TLO.CombineTo(Op, Op.getOperand(0));
18962 
18963   auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
18964     return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
18965   };
  // Rewrites the AND to use NewMask; returns true without touching the DAG
  // when the mask is already NewMask.
18966   auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
18967     if (NewMask == Mask)
18968       return true;
18969     SDLoc DL(Op);
18970     SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
18971     SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
18972     return TLO.CombineTo(Op, NewOp);
18973   };
18974 
18975   // Prefer uxtb mask.
18976   if (IsLegalMask(0xFF))
18977     return UseMask(0xFF);
18978 
18979   // Prefer uxth mask.
18980   if (IsLegalMask(0xFFFF))
18981     return UseMask(0xFFFF);
18982 
18983   // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
18984   // FIXME: Prefer a contiguous sequence of bits for other optimizations.
18985   if (ShrunkMask < 256)
18986     return UseMask(ShrunkMask);
18987 
18988   // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
18989   // FIXME: Prefer a contiguous sequence of bits for other optimizations.
18990   if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
18991     return UseMask(ExpandedMask);
18992 
18993   // Potential improvements:
18994   //
18995   // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
18996   // We could try to prefer Thumb1 immediates which can be lowered to a
18997   // two-instruction sequence.
18998   // We could try to recognize more legal ARM/Thumb2 immediates here.
18999 
19000   return false;
19001}
19002
// SimplifyDemandedBitsForTargetNode - Target hook for demanded-bits
// simplification of ARM-specific nodes. Currently only shrinks ASRL/LSRL
// long shifts into a plain SHL when the demanded bits permit it.
19006                                                    unsigned Depth) const {
19007   unsigned Opc = Op.getOpcode();
19008 
19009   switch (Opc) {
19010   case ARMISD::ASRL:
19011   case ARMISD::LSRL: {
19012     // If this is result 0 and the other result is unused, see if the demand
19013     // bits allow us to shrink this long shift into a standard small shift in
19014     // the opposite direction.
19015     if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
19016         isa<ConstantSDNode>(Op->getOperand(2))) {
19017       unsigned ShAmt = Op->getConstantOperandVal(2);
      // Only the top ShAmt bits may be demanded; those come from operand 1
      // shifted left by (32 - ShAmt).
19018       if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(
19019                             APInt::getAllOnesValue(32) << (32 - ShAmt)))
19020         return TLO.CombineTo(
19021             Op, TLO.DAG.getNode(
19022                     ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
19023                     TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
19024     }
19025     break;
19026   }
19027   }
19028 
19031}
19032
19033//===----------------------------------------------------------------------===//
19034// ARM Inline Assembly Support
19035//===----------------------------------------------------------------------===//
19036
// ExpandInlineAsm - Recognise the inline-asm idiom "rev $0, $1" with a
// "=l,l" constraint string on a 32-bit integer and replace the call with
// the byte-swap intrinsic, so the compiler can reason about it.
19038   // Looking for "rev" which is V6+.
19039   if (!Subtarget->hasV6Ops())
19040     return false;
19041 
19043   std::string AsmStr = IA->getAsmString();
19045   SplitString(AsmStr, AsmPieces, ";\n");
19046 
19047   switch (AsmPieces.size()) {
19048   default: return false;
19049   case 1:
    // Re-tokenise the single statement into mnemonic and operands.
19050     AsmStr = std::string(AsmPieces[0]);
19051     AsmPieces.clear();
19052     SplitString(AsmStr, AsmPieces, " \t,");
19053 
19054     // rev $0, $1
19055     if (AsmPieces.size() == 3 &&
19056         AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
19057         IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
19059       if (Ty && Ty->getBitWidth() == 32)
19061     }
19062     break;
19063   }
19064 
19065   return false;
19066}
19067
19068const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
19069 // At this point, we have to lower this constraint to something else, so we
19070 // lower it to an "r" or "w". However, by doing this we will force the result
19071 // to be in register, while the X constraint is much more permissive.
19072 //
19073 // Although we are correct (we are free to emit anything, without
19074 // constraints), we might break use cases that would expect us to be more
19075 // efficient and emit something else.
19076 if (!Subtarget->hasVFP2Base())
19077 return "r";
19078 if (ConstraintVT.isFloatingPoint())
19079 return "w";
19080 if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
19081 (ConstraintVT.getSizeInBits() == 64 ||
19082 ConstraintVT.getSizeInBits() == 128))
19083 return "w";
19084
19085 return "r";
19086}
19087
19088/// getConstraintType - Given a constraint letter, return the type of
19089/// constraint it is for this target.
19092   unsigned S = Constraint.size();
19093   if (S == 1) {
19094     switch (Constraint[0]) {
19095     default:  break;
    // Single-letter GCC ARM constraints: all register-class except 'j'
    // (movw-suitable immediate) and 'Q' (memory).
19096     case 'l': return C_RegisterClass;
19097     case 'w': return C_RegisterClass;
19098     case 'h': return C_RegisterClass;
19099     case 'x': return C_RegisterClass;
19100     case 't': return C_RegisterClass;
19101     case 'j': return C_Immediate; // Constant for movw.
19102     // An address with a single base register. Due to the way we
19103     // currently handle addresses it is the same as an 'r' memory constraint.
19104     case 'Q': return C_Memory;
19105     }
19106   } else if (S == 2) {
19107     switch (Constraint[0]) {
19108     default: break;
    // 'Te'/'To' (even/odd low registers) are register classes.
19109     case 'T': return C_RegisterClass;
19110     // All 'U+' constraints are addresses.
19111     case 'U': return C_Memory;
19112     }
19113   }
  // Anything else falls through to the generic handling.
19114   return TargetLowering::getConstraintType(Constraint);
19115}
19116
19117/// Examine constraint type and operand type and determine a weight value.
19118/// This object must already have been set up with the operand type
19119/// and the current alternative constraint selected.
19122     AsmOperandInfo &info, const char *constraint) const {
19124   Value *CallOperandVal = info.CallOperandVal;
19125   // If we don't have a value, we can't do a match,
19126   // but allow it at the lowest weight.
19127   if (!CallOperandVal)
19128     return CW_Default;
19129   Type *type = CallOperandVal->getType();
19130   // Look at the constraint type.
19131   switch (*constraint) {
19132   default:
19134     break;
19135   case 'l':
    // 'l' (low registers) is the natural class on Thumb, so weight it as a
    // specific-register match there; on ARM it is just a register match.
19136     if (type->isIntegerTy()) {
19137       if (Subtarget->isThumb())
19138         weight = CW_SpecificReg;
19139       else
19140         weight = CW_Register;
19141     }
19142     break;
19143   case 'w':
    // 'w' (VFP/NEON registers) only matches floating-point operands.
19144     if (type->isFloatingPointTy())
19145       weight = CW_Register;
19146     break;
19147   }
19148   return weight;
19149}
19150
19151using RCPair = std::pair<unsigned, const TargetRegisterClass *>;

// getRegForInlineAsmConstraint - Map a GCC ARM constraint letter (and the
// operand's value type) to a concrete register class (or, for "{cc}", to the
// CPSR register itself). Unknown constraints defer to the generic handler.
19154     const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
19155   switch (Constraint.size()) {
19156   case 1:
19157     // GCC ARM Constraint Letters
19158     switch (Constraint[0]) {
19159     case 'l': // Low regs or general regs.
19160       if (Subtarget->isThumb())
19161         return RCPair(0U, &ARM::tGPRRegClass);
19162       return RCPair(0U, &ARM::GPRRegClass);
19163     case 'h': // High regs or no regs.
19164       if (Subtarget->isThumb())
19165         return RCPair(0U, &ARM::hGPRRegClass);
19166       break;
19167     case 'r':
19168       if (Subtarget->isThumb1Only())
19169         return RCPair(0U, &ARM::tGPRRegClass);
19170       return RCPair(0U, &ARM::GPRRegClass);
    // 'w': any VFP register, sized by the operand type (S/D/Q).
19171     case 'w':
19172       if (VT == MVT::Other)
19173         break;
19174       if (VT == MVT::f32)
19175         return RCPair(0U, &ARM::SPRRegClass);
19176       if (VT.getSizeInBits() == 64)
19177         return RCPair(0U, &ARM::DPRRegClass);
19178       if (VT.getSizeInBits() == 128)
19179         return RCPair(0U, &ARM::QPRRegClass);
19180       break;
    // 'x': restricted to the low half of the VFP register file.
19181     case 'x':
19182       if (VT == MVT::Other)
19183         break;
19184       if (VT == MVT::f32)
19185         return RCPair(0U, &ARM::SPR_8RegClass);
19186       if (VT.getSizeInBits() == 64)
19187         return RCPair(0U, &ARM::DPR_8RegClass);
19188       if (VT.getSizeInBits() == 128)
19189         return RCPair(0U, &ARM::QPR_8RegClass);
19190       break;
    // 't': VFP2-addressable registers.
19191     case 't':
19192       if (VT == MVT::Other)
19193         break;
19194       if (VT == MVT::f32 || VT == MVT::i32)
19195         return RCPair(0U, &ARM::SPRRegClass);
19196       if (VT.getSizeInBits() == 64)
19197         return RCPair(0U, &ARM::DPR_VFP2RegClass);
19198       if (VT.getSizeInBits() == 128)
19199         return RCPair(0U, &ARM::QPR_VFP2RegClass);
19200       break;
19201     }
19202     break;
19203 
19204   case 2:
19205     if (Constraint[0] == 'T') {
19206       switch (Constraint[1]) {
19207       default:
19208         break;
      // 'Te'/'To': even/odd-numbered low GPRs (used by some MVE insns).
19209       case 'e':
19210         return RCPair(0U, &ARM::tGPREvenRegClass);
19211       case 'o':
19212         return RCPair(0U, &ARM::tGPROddRegClass);
19213       }
19214     }
19215     break;
19216 
19217   default:
19218     break;
19219   }
19220 
19221   if (StringRef("{cc}").equals_insensitive(Constraint))
19222     return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
19223 
19224   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
19225}
19226
19227/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
19228/// vector.  If it is invalid, don't add anything to Ops.
19230                                                      std::string &Constraint,
19231                                                      std::vector<SDValue>&Ops,
19232                                                      SelectionDAG &DAG) const {
19233   SDValue Result;
19234 
19235   // Currently only support length 1 constraints.
19236   if (Constraint.length() != 1) return;
19237 
19238   char ConstraintLetter = Constraint[0];
19239   switch (ConstraintLetter) {
19240     default: break;
19241     case 'j':
19242     case 'I': case 'J': case 'K': case 'L':
19243     case 'M': case 'N': case 'O':
      // All of these constraints require a ConstantSDNode operand.
19245       if (!C)
19246         return;
19247 
19248       int64_t CVal64 = C->getSExtValue();
19249       int CVal = (int) CVal64;
19250       // None of these constraints allow values larger than 32 bits.  Check
19251       // that the value fits in an int.
19252       if (CVal != CVal64)
19253         return;
19254 
      // Each case below `break`s when CVal satisfies the constraint (so a
      // target constant is produced at the bottom) and `return`s otherwise.
19255       switch (ConstraintLetter) {
19256         case 'j':
19257           // Constant suitable for movw, must be between 0 and
19258           // 65535.
19259           if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
19260             if (CVal >= 0 && CVal <= 65535)
19261               break;
19262           return;
19263         case 'I':
19264           if (Subtarget->isThumb1Only()) {
19265             // This must be a constant between 0 and 255, for ADD
19266             // immediates.
19267             if (CVal >= 0 && CVal <= 255)
19268               break;
19269           } else if (Subtarget->isThumb2()) {
19270             // A constant that can be used as an immediate value in a
19271             // data-processing instruction.
19272             if (ARM_AM::getT2SOImmVal(CVal) != -1)
19273               break;
19274           } else {
19275             // A constant that can be used as an immediate value in a
19276             // data-processing instruction.
19277             if (ARM_AM::getSOImmVal(CVal) != -1)
19278               break;
19279           }
19280           return;
19281 
19282         case 'J':
19283           if (Subtarget->isThumb1Only()) {
19284             // This must be a constant between -255 and -1, for negated ADD
19285             // immediates. This can be used in GCC with an "n" modifier that
19286             // prints the negated value, for use with SUB instructions. It is
19287             // not useful otherwise but is implemented for compatibility.
19288             if (CVal >= -255 && CVal <= -1)
19289               break;
19290           } else {
19291             // This must be a constant between -4095 and 4095. It is not clear
19292             // what this constraint is intended for. Implemented for
19293             // compatibility with GCC.
19294             if (CVal >= -4095 && CVal <= 4095)
19295               break;
19296           }
19297           return;
19298 
19299         case 'K':
19300           if (Subtarget->isThumb1Only()) {
19301             // A 32-bit value where only one byte has a nonzero value. Exclude
19302             // zero to match GCC. This constraint is used by GCC internally for
19303             // constants that can be loaded with a move/shift combination.
19304             // It is not useful otherwise but is implemented for compatibility.
19305             if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
19306               break;
19307           } else if (Subtarget->isThumb2()) {
19308             // A constant whose bitwise inverse can be used as an immediate
19309             // value in a data-processing instruction. This can be used in GCC
19310             // with a "B" modifier that prints the inverted value, for use with
19311             // BIC and MVN instructions. It is not useful otherwise but is
19312             // implemented for compatibility.
19313             if (ARM_AM::getT2SOImmVal(~CVal) != -1)
19314               break;
19315           } else {
19316             // A constant whose bitwise inverse can be used as an immediate
19317             // value in a data-processing instruction. This can be used in GCC
19318             // with a "B" modifier that prints the inverted value, for use with
19319             // BIC and MVN instructions. It is not useful otherwise but is
19320             // implemented for compatibility.
19321             if (ARM_AM::getSOImmVal(~CVal) != -1)
19322               break;
19323           }
19324           return;
19325 
19326         case 'L':
19327           if (Subtarget->isThumb1Only()) {
19328             // This must be a constant between -7 and 7,
19329             // for 3-operand ADD/SUB immediate instructions.
19330             if (CVal >= -7 && CVal < 7)
19331               break;
19332           } else if (Subtarget->isThumb2()) {
19333             // A constant whose negation can be used as an immediate value in a
19334             // data-processing instruction. This can be used in GCC with an "n"
19335             // modifier that prints the negated value, for use with SUB
19336             // instructions. It is not useful otherwise but is implemented for
19337             // compatibility.
19338             if (ARM_AM::getT2SOImmVal(-CVal) != -1)
19339               break;
19340           } else {
19341             // A constant whose negation can be used as an immediate value in a
19342             // data-processing instruction. This can be used in GCC with an "n"
19343             // modifier that prints the negated value, for use with SUB
19344             // instructions. It is not useful otherwise but is implemented for
19345             // compatibility.
19346             if (ARM_AM::getSOImmVal(-CVal) != -1)
19347               break;
19348           }
19349           return;
19350 
19351         case 'M':
19352           if (Subtarget->isThumb1Only()) {
19353             // This must be a multiple of 4 between 0 and 1020, for
19354             // ADD sp + immediate.
19355             if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
19356               break;
19357           } else {
19358             // A power of two or a constant between 0 and 32.  This is used in
19359             // GCC for the shift amount on shifted register operands, but it is
19360             // useful in general for any shift amounts.
19361             if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
19362               break;
19363           }
19364           return;
19365 
19366         case 'N':
19367           if (Subtarget->isThumb1Only()) {
19368             // This must be a constant between 0 and 31, for shift amounts.
19369             if (CVal >= 0 && CVal <= 31)
19370               break;
19371           }
19372           return;
19373 
19374         case 'O':
19375           if (Subtarget->isThumb1Only()) {
19376             // This must be a multiple of 4 between -508 and 508, for
19377             // ADD/SUB sp = sp + immediate.
19378             if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
19379               break;
19380           }
19381           return;
19382       }
19383       Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
19384       break;
19385   }
19386 
19387   if (Result.getNode()) {
19388     Ops.push_back(Result);
19389     return;
19390   }
19391   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
19392}
19393
// getDivRemLibcall - Select the signed/unsigned __aeabi_*divmod-style RTLIB
// libcall matching the node's opcode (signedness) and operand width.
19395     const SDNode *N, MVT::SimpleValueType SVT) {
19396   assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
19397           N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
19398          "Unhandled Opcode in getDivRemLibcall");
19399   bool isSigned = N->getOpcode() == ISD::SDIVREM ||
19400                   N->getOpcode() == ISD::SREM;
19402   switch (SVT) {
19403   default: llvm_unreachable("Unexpected request for libcall!");
19404   case MVT::i8:  LC = isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
19405   case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
19406   case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
19407   case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
19408   }
19409   return LC;
19410}
19411
// getDivRemArgList - Build the call-lowering argument list for a divmod
// libcall from the node's operands, with sign/zero extension flags matching
// the operation's signedness.
19413     const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
19414   assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
19415           N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
19416          "Unhandled Opcode in getDivRemArgList");
19417   bool isSigned = N->getOpcode() == ISD::SDIVREM ||
19418                   N->getOpcode() == ISD::SREM;
19421   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
19422     EVT ArgVT = N->getOperand(i).getValueType();
19423     Type *ArgTy = ArgVT.getTypeForEVT(*Context);
19424     Entry.Node = N->getOperand(i);
19425     Entry.Ty = ArgTy;
19426     Entry.IsSExt = isSigned;
19427     Entry.IsZExt = !isSigned;
19428     Args.push_back(Entry);
19429   }
  // The Windows divmod helpers take (divisor, dividend), the reverse of the
  // AEABI ordering, so swap the first two arguments there.
19430   if (Subtarget->isTargetWindows() && Args.size() >= 2)
19431     std::swap(Args[0], Args[1]);
19432   return Args;
19433}
19434
// LowerDivRem - Lower ISD::SDIVREM / ISD::UDIVREM either to hardware
// divide + multiply + subtract (when the subtarget has a divider) or to an
// AEABI/Windows divmod libcall returning both results.
19435SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
19436   assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
19437           Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
19438           Subtarget->isTargetWindows()) &&
19439          "Register-based DivRem lowering only");
19440   unsigned Opcode = Op->getOpcode();
19441   assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
19442          "Invalid opcode for Div/Rem lowering");
19443   bool isSigned = (Opcode == ISD::SDIVREM);
19444   EVT VT = Op->getValueType(0);
19445   Type *Ty = VT.getTypeForEVT(*DAG.getContext());
19446   SDLoc dl(Op);
19447 
19448   // If the target has hardware divide, use divide + multiply + subtract:
19449   //     div = a / b
19450   //     rem = a - b * div
19451   //     return {div, rem}
19452   // This should be lowered into UDIV/SDIV + MLS later on.
19453   bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
19454                                         : Subtarget->hasDivideInARMMode();
19455   if (hasDivide && Op->getValueType(0).isSimple() &&
19456       Op->getSimpleValueType(0) == MVT::i32) {
19457     unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
19458     const SDValue Dividend = Op->getOperand(0);
19459     const SDValue Divisor = Op->getOperand(1);
19460     SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
19461     SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
19462     SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
19463 
19464     SDValue Values[2] = {Div, Rem};
19465     return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
19466   }
19467 
  // Libcall path: pick the divmod helper and marshal the arguments.
19469                                        VT.getSimpleVT().SimpleTy);
19471 
19473                                                     DAG.getContext(),
19474                                                     Subtarget);
19475 
19478 
  // The helper returns {quotient, remainder} as an aggregate.
19479   Type *RetTy = StructType::get(Ty, Ty);
19480 
19481   if (Subtarget->isTargetWindows())
19482     InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
19483 
19486       .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
19487       .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
19488 
19489   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
19490   return CallInfo.first;
19491}
19492
19493// Lowers REM using divmod helpers
19494// see RTABI section 4.2/4.3
19495SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
19496   // Build return types (div and rem)
19497   std::vector<Type*> RetTyParams;
19499 
  // Map the operation width to the element type of the returned pair.
19500   switch (N->getValueType(0).getSimpleVT().SimpleTy) {
19501   default: llvm_unreachable("Unexpected request for libcall!");
19502   case MVT::i8:  RetTyElement = Type::getInt8Ty(*DAG.getContext());  break;
19503   case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
19504   case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
19505   case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
19506   }
19507 
  // The divmod helper returns a {div, rem} pair, so the struct has two
  // identical fields.
19508   RetTyParams.push_back(RetTyElement);
19509   RetTyParams.push_back(RetTyElement);
19512 
19513   RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
19514                                                              SimpleTy);
19517                                                     Subtarget);
19518   bool isSigned = N->getOpcode() == ISD::SREM;
19521 
19522   if (Subtarget->isTargetWindows())
19524 
19525   // Lower call
19526   CallLoweringInfo CLI(DAG);
19527   CLI.setChain(InChain)
19528      .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
19529      .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
19530   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
19531 
19532   // Return second (rem) result operand (first contains div)
19533   SDNode *ResNode = CallResult.first.getNode();
19534   assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
19535   return ResNode->getOperand(1);
19536}
19537
19538SDValue
19539ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
  // Windows-only lowering: dynamic allocations must go through __chkstk so
  // the guard page is touched as the stack grows.
19540   assert(Subtarget->isTargetWindows() && "unsupported target platform");
19541   SDLoc DL(Op);
19542 
19543   // Get the inputs.
19544   SDValue Chain = Op.getOperand(0);
19545   SDValue Size  = Op.getOperand(1);
19546 
  // With "no-stack-arg-probe" the probe is skipped: just adjust SP directly
  // (and re-align it if the node carries an alignment requirement).
19548           "no-stack-arg-probe")) {
19550         cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
19551     SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
19552     Chain = SP.getValue(1);
19553     SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
19554     if (Align)
19555       SP =
19556           DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
19557                       DAG.getConstant(-(uint64_t)Align->value(), DL, MVT::i32));
19558     Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
19559     SDValue Ops[2] = { SP, Chain };
19560     return DAG.getMergeValues(Ops, DL);
19561   }
19562 
  // __chkstk takes the allocation size in 4-byte words in R4 (hence the
  // shift-right by 2) and adjusts SP itself.
19564                               DAG.getConstant(2, DL, MVT::i32));
19565 
19566   SDValue Flag;
19567   Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
19568   Flag = Chain.getValue(1);
19569 
19571   Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);
19572 
  // Read back the SP that __chkstk established; that is the allocation.
19573   SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
19574   Chain = NewSP.getValue(1);
19575 
19576   SDValue Ops[2] = { NewSP, Chain };
19577   return DAG.getMergeValues(Ops, DL);
19578}
19579
19580SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
// Custom lowering for (STRICT_)FP_EXTEND on subtargets that lack some of the
// FP conversion instructions: widen in hardware where a step is supported,
// otherwise fall back to the RTLIB extension libcalls, possibly doing the
// conversion in two 16->32->64 steps.
19581 bool IsStrict = Op->isStrictFPOpcode();
19582 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
19583 const unsigned DstSz = Op.getValueType().getSizeInBits();
19584 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
19586 "Unexpected type for custom-lowering FP_EXTEND");
19587
19588 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
19589 "With both FP DP and 16, any FP conversion is legal!");
19590
19591 assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
19592 "With FP16, 16 to 32 conversion is legal!");
19593
19594 // Converting from 32 -> 64 is valid if we have FP64.
19595 if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
19596 // FIXME: Remove this when we have strict fp instruction selection patterns
19597 if (IsStrict) {
19598 SDLoc Loc(Op);
// The elided line presumably builds a non-strict FP_EXTEND node whose result
// is merged with the incoming chain — confirm against the full source.
19600 Loc, Op.getValueType(), SrcVal);
19601 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
19602 }
19603 return Op;
19604 }
19605
19606 // Either we are converting from 16 -> 64, without FP16 and/or
19607 // FP.double-precision or without Armv8-fp. So we must do it in two
19608 // steps.
19609 // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32
19610 // without FP16. So we must do a function call.
19611 SDLoc Loc(Op);
19613 MakeLibCallOptions CallOptions;
19614 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
// Widen one step at a time (16->32, then 32->64) until DstSz is reached.
19615 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
19616 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
19617 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
19618 MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
19619 if (Supported) {
// Hardware step: emit an FP_EXTEND (strict variant threads the chain).
19620 if (IsStrict) {
19622 {DstVT, MVT::Other}, {Chain, SrcVal});
19623 Chain = SrcVal.getValue(1);
19624 } else {
19626 }
19627 } else {
// Library step: the elided line presumably selects the FPEXT libcall for
// SrcVT -> DstVT (RTLIB::getFPEXT) — confirm.
19629 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
19630 "Unexpected type for custom-lowering FP_EXTEND");
19631 std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
19632 Loc, Chain);
19633 }
19634 }
19635
19636 return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
19637}
19638
19639SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
// Custom lowering for (STRICT_)FP_ROUND: keep the operation when the
// subtarget can narrow 32 -> 16 in hardware, otherwise emit the appropriate
// truncation libcall.
19640 bool IsStrict = Op->isStrictFPOpcode();
19641
19642 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
19643 EVT SrcVT = SrcVal.getValueType();
19644 EVT DstVT = Op.getValueType();
19645 const unsigned DstSz = Op.getValueType().getSizeInBits();
19646 const unsigned SrcSz = SrcVT.getSizeInBits();
// DstSz is only consumed by the (elided) assert below; silence -Wunused.
19647 (void)DstSz;
19649 "Unexpected type for custom-lowering FP_ROUND");
19650
19651 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
19652 "With both FP DP and 16, any FP conversion is legal!");
19653
19654 SDLoc Loc(Op);
19655
19656 // Instruction from 32 -> 16 if hasFP16 is valid
19657 if (SrcSz == 32 && Subtarget->hasFP16())
19658 return Op;
19659
19660 // Lib call from 32 -> 16 / 64 -> [32, 16]
// The elided line presumably selects the FPROUND libcall for SrcVT -> DstVT
// (RTLIB::getFPROUND) — confirm against the full source.
19662 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
19663 "Unexpected type for custom-lowering FP_ROUND");
19664 MakeLibCallOptions CallOptions;
19665 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
19667 std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
19668 Loc, Chain);
19669 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
19670}
19671
19672void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
19673 SelectionDAG &DAG) const {
// Expand a 64-bit ABS into i32 halves using the classic branch-free
// sign-mask sequence: t = Hi >> 31 (all ones if negative); add t to the
// value with carry, then XOR both halves with t. Low and high results are
// pushed onto Results for the type legalizer to reassemble.
19674 assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS.");
19675 MVT HalfT = MVT::i32;
19676 SDLoc dl(N);
19677 SDValue Hi, Lo, Tmp;
19678
// NOTE(review): an early-out condition is elided here (lines 19679-19680);
// presumably it bails when ADDCARRY/UADDO are unavailable — confirm.
19681 return ;
19682
19683 unsigned OpTypeBits = HalfT.getScalarSizeInBits();
19684 SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);
19685
// Split the i64 operand into its low (index 0) and high (index 1) i32 halves.
19686 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
19687 DAG.getConstant(0, dl, HalfT));
19688 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
19689 DAG.getConstant(1, dl, HalfT));
19690
// Tmp = sign mask: 0 for non-negative input, 0xffffffff for negative.
19691 Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi,
19692 DAG.getConstant(OpTypeBits - 1, dl,
19694 Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
19695 Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
19696 SDValue(Lo.getNode(), 1));
// XOR with the sign mask completes the conditional negation: (x + t) ^ t.
19697 Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
19698 Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
19699
19700 Results.push_back(Lo);
19701 Results.push_back(Hi);
19702}
19703
19704bool
// Global-address offset folding is disabled for ARM: always report false so
// the DAG keeps offsets as separate additions.
19706 // The ARM target isn't yet aware of offsets.
19707 return false;
19708}
19709
// Returns true if v is the inverse of a contiguous bit-field mask, i.e. the
// zero bits of v form one contiguous run (as used by BFC/BFI). All-ones is
// rejected because there is no field to clear/insert.
19711 if (v == 0xffffffff)
19712 return false;
19713
19714 // there can be 1's on either or both "outsides", all the "inside"
19715 // bits must be 0's
19716 return isShiftedMask_32(~v);
19717}
19718
19719/// isFPImmLegal - Returns true if the target can instruction select the
19720/// specified FP immediate natively. If false, the legalizer will
19721/// materialize the FP immediate as a load from a constant pool.
19723 bool ForCodeSize) const {
// VFPv3 is the baseline for VMOV with an encoded FP immediate; each helper
// below returns -1 when the value has no 8-bit VFP immediate encoding.
19724 if (!Subtarget->hasVFP3Base())
19725 return false;
19726 if (VT == MVT::f16 && Subtarget->hasFullFP16())
19727 return ARM_AM::getFP16Imm(Imm) != -1;
// An f32 value exactly representable as an f16 immediate is also legal when
// full FP16 is available (materialize as f16 and convert).
19728 if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
19729 ARM_AM::getFP32FP16Imm(Imm) != -1)
19730 return true;
19731 if (VT == MVT::f32)
19732 return ARM_AM::getFP32Imm(Imm) != -1;
19733 if (VT == MVT::f64 && Subtarget->hasFP64())
19734 return ARM_AM::getFP64Imm(Imm) != -1;
19735 return false;
19736}
19737
19738/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
19739/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
19740/// specified in the intrinsic calls.
19742 const CallInst &I,
19743 MachineFunction &MF,
19744 unsigned Intrinsic) const {
// For each memory intrinsic, fill in IntrinsicInfo: opcode kind, the memory
// VT actually touched, the pointer operand, alignment, and access flags.
// Returns false for intrinsics that do not access memory.
19745 switch (Intrinsic) {
19746 case Intrinsic::arm_neon_vld1:
19747 case Intrinsic::arm_neon_vld2:
19748 case Intrinsic::arm_neon_vld3:
19749 case Intrinsic::arm_neon_vld4:
19750 case Intrinsic::arm_neon_vld2lane:
19751 case Intrinsic::arm_neon_vld3lane:
19752 case Intrinsic::arm_neon_vld4lane:
19753 case Intrinsic::arm_neon_vld2dup:
19754 case Intrinsic::arm_neon_vld3dup:
19755 case Intrinsic::arm_neon_vld4dup: {
19756 Info.opc = ISD::INTRINSIC_W_CHAIN;
19757 // Conservatively set memVT to the entire set of vectors loaded.
19758 auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
19759 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
19760 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
19761 Info.ptrVal = I.getArgOperand(0);
19762 Info.offset = 0;
// The NEON vldN intrinsics carry the alignment as their last argument.
19763 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
19764 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
19765 // volatile loads with NEON intrinsics not supported
19766 Info.flags = MachineMemOperand::MOLoad;
19767 return true;
19768 }
19769 case Intrinsic::arm_neon_vld1x2:
19770 case Intrinsic::arm_neon_vld1x3:
19771 case Intrinsic::arm_neon_vld1x4: {
19772 Info.opc = ISD::INTRINSIC_W_CHAIN;
19773 // Conservatively set memVT to the entire set of vectors loaded.
19774 auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
19775 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
19776 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
// Unlike vldN, the vld1xN intrinsics take the pointer as their last argument
// and carry no explicit alignment.
19777 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
19778 Info.offset = 0;
19779 Info.align.reset();
19780 // volatile loads with NEON intrinsics not supported
19781 Info.flags = MachineMemOperand::MOLoad;
19782 return true;
19783 }
19784 case Intrinsic::arm_neon_vst1:
19785 case Intrinsic::arm_neon_vst2:
19786 case Intrinsic::arm_neon_vst3:
19787 case Intrinsic::arm_neon_vst4:
19788 case Intrinsic::arm_neon_vst2lane:
19789 case Intrinsic::arm_neon_vst3lane:
19790 case Intrinsic::arm_neon_vst4lane: {
19791 Info.opc = ISD::INTRINSIC_VOID;
19792 // Conservatively set memVT to the entire set of vectors stored.
19793 auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
19794 unsigned NumElts = 0;
// Sum the sizes of the vector value operands (args after the pointer, up to
// the first non-vector argument, which begins the trailing scalars).
19795 for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
19796 Type *ArgTy = I.getArgOperand(ArgI)->getType();
19797 if (!ArgTy->isVectorTy())
19798 break;
19799 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
19800 }
19801 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
19802 Info.ptrVal = I.getArgOperand(0);
19803 Info.offset = 0;
19804 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
19805 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
19806 // volatile stores with NEON intrinsics not supported
19807 Info.flags = MachineMemOperand::MOStore;
19808 return true;
19809 }
19810 case Intrinsic::arm_neon_vst1x2:
19811 case Intrinsic::arm_neon_vst1x3:
19812 case Intrinsic::arm_neon_vst1x4: {
19813 Info.opc = ISD::INTRINSIC_VOID;
19814 // Conservatively set memVT to the entire set of vectors stored.
19815 auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
19816 unsigned NumElts = 0;
19817 for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
19818 Type *ArgTy = I.getArgOperand(ArgI)->getType();
19819 if (!ArgTy->isVectorTy())
19820 break;
19821 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
19822 }
19823 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
19824 Info.ptrVal = I.getArgOperand(0);
19825 Info.offset = 0;
// vst1xN has no alignment operand; leave the alignment unknown.
19826 Info.align.reset();
19827 // volatile stores with NEON intrinsics not supported
19828 Info.flags = MachineMemOperand::MOStore;
19829 return true;
19830 }
19831 case Intrinsic::arm_mve_vld2q:
19832 case Intrinsic::arm_mve_vld4q: {
19833 Info.opc = ISD::INTRINSIC_W_CHAIN;
19834 // Conservatively set memVT to the entire set of vectors loaded.
19835 Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
19836 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
// Each of the Factor result vectors is 128 bits, i.e. two i64 elements.
19837 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
19838 Info.ptrVal = I.getArgOperand(0);
19839 Info.offset = 0;
// MVE interleaved accesses are only element-aligned.
19840 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
19841 // volatile loads with MVE intrinsics not supported
19842 Info.flags = MachineMemOperand::MOLoad;
19843 return true;
19844 }
19845 case Intrinsic::arm_mve_vst2q:
19846 case Intrinsic::arm_mve_vst4q: {
19847 Info.opc = ISD::INTRINSIC_VOID;
19848 // Conservatively set memVT to the entire set of vectors stored.
19849 Type *VecTy = I.getArgOperand(1)->getType();
19850 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
19851 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
19852 Info.ptrVal = I.getArgOperand(0);
19853 Info.offset = 0;
19854 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
19855 // volatile stores with MVE intrinsics not supported
19856 Info.flags = MachineMemOperand::MOStore;
19857 return true;
19858 }
// MVE gathers/scatters have no single base pointer to record (ptrVal is
// null); only the access size, alignment, and direction are meaningful.
19859 case Intrinsic::arm_mve_vldr_gather_base:
19860 case Intrinsic::arm_mve_vldr_gather_base_predicated: {
19861 Info.opc = ISD::INTRINSIC_W_CHAIN;
19862 Info.ptrVal = nullptr;
19863 Info.memVT = MVT::getVT(I.getType());
19864 Info.align = Align(1);
19865 Info.flags |= MachineMemOperand::MOLoad;
19866 return true;
19867 }
19868 case Intrinsic::arm_mve_vldr_gather_base_wb:
19869 case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
19870 Info.opc = ISD::INTRINSIC_W_CHAIN;
19871 Info.ptrVal = nullptr;
// The writeback form returns {data, new base}; element 0 is the loaded data.
19872 Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
19873 Info.align = Align(1);
19874 Info.flags |= MachineMemOperand::MOLoad;
19875 return true;
19876 }
19877 case Intrinsic::arm_mve_vldr_gather_offset:
19878 case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
19879 Info.opc = ISD::INTRINSIC_W_CHAIN;
19880 Info.ptrVal = nullptr;
19881 MVT DataVT = MVT::getVT(I.getType());
// The third operand is the in-memory element size in bits, which may be
// narrower than the register element type (widening gather).
19882 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
19883 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
19884 DataVT.getVectorNumElements());
19885 Info.align = Align(1);
19886 Info.flags |= MachineMemOperand::MOLoad;
19887 return true;
19888 }
19889 case Intrinsic::arm_mve_vstr_scatter_base:
19890 case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
19891 Info.opc = ISD::INTRINSIC_VOID;
19892 Info.ptrVal = nullptr;
19893 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
19894 Info.align = Align(1);
19895 Info.flags |= MachineMemOperand::MOStore;
19896 return true;
19897 }
19898 case Intrinsic::arm_mve_vstr_scatter_base_wb:
19899 case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
// W_CHAIN (not VOID) because the writeback form produces the updated base.
19900 Info.opc = ISD::INTRINSIC_W_CHAIN;
19901 Info.ptrVal = nullptr;
19902 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
19903 Info.align = Align(1);
19904 Info.flags |= MachineMemOperand::MOStore;
19905 return true;
19906 }
19907 case Intrinsic::arm_mve_vstr_scatter_offset:
19908 case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
19909 Info.opc = ISD::INTRINSIC_VOID;
19910 Info.ptrVal = nullptr;
19911 MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
19912 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
19913 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
19914 DataVT.getVectorNumElements());
19915 Info.align = Align(1);
19916 Info.flags |= MachineMemOperand::MOStore;
19917 return true;
19918 }
19919 case Intrinsic::arm_ldaex:
19920 case Intrinsic::arm_ldrex: {
19921 auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
19922 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
19923 Info.opc = ISD::INTRINSIC_W_CHAIN;
19924 Info.memVT = MVT::getVT(PtrTy->getElementType());
19925 Info.ptrVal = I.getArgOperand(0);
19926 Info.offset = 0;
19927 Info.align = DL.getABITypeAlign(PtrTy->getElementType());
// NOTE(review): the flags assignment is elided here; presumably
// MOLoad | MOVolatile (exclusive loads must not be reordered) — confirm.
19929 return true;
19930 }
19931 case Intrinsic::arm_stlex:
19932 case Intrinsic::arm_strex: {
19933 auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
19934 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
19935 Info.opc = ISD::INTRINSIC_W_CHAIN;
19936 Info.memVT = MVT::getVT(PtrTy->getElementType());
19937 Info.ptrVal = I.getArgOperand(1);
19938 Info.offset = 0;
19939 Info.align = DL.getABITypeAlign(PtrTy->getElementType());
// NOTE(review): flags assignment elided; presumably MOStore | MOVolatile.
19941 return true;
19942 }
19943 case Intrinsic::arm_stlexd:
19944 case Intrinsic::arm_strexd:
19945 Info.opc = ISD::INTRINSIC_W_CHAIN;
19946 Info.memVT = MVT::i64;
19947 Info.ptrVal = I.getArgOperand(2);
19948 Info.offset = 0;
19949 Info.align = Align(8);
// NOTE(review): flags assignment elided; presumably MOStore | MOVolatile.
19951 return true;
19952
19953 case Intrinsic::arm_ldaexd:
19954 case Intrinsic::arm_ldrexd:
19955 Info.opc = ISD::INTRINSIC_W_CHAIN;
19956 Info.memVT = MVT::i64;
19957 Info.ptrVal = I.getArgOperand(0);
19958 Info.offset = 0;
19959 Info.align = Align(8);
// NOTE(review): flags assignment elided; presumably MOLoad | MOVolatile.
19961 return true;
19962
19963 default:
19964 break;
19965 }
19966
19967 return false;
19968}
19969
19970/// Returns true if it is beneficial to convert a load of a constant
19971/// to just the constant itself.
19973 Type *Ty) const {
19974 assert(Ty->isIntegerTy());
19975
// Any integer up to 32 bits can be materialized with a short mov/movw/movt
// style sequence; wider (or zero-width) values are better left as loads.
19976 unsigned Bits = Ty->getPrimitiveSizeInBits();
19977 if (Bits == 0 || Bits > 32)
19978 return false;
19979 return true;
19980}
19981
19983 unsigned Index) const {
// Extracting a subvector is only cheap when it is the low or high half of
// the source register (the elided guard presumably also requires the result
// type to be legal — confirm).
19985 return false;
19986
19987 return (Index == 0 || Index == ResVT.getVectorNumElements());
19988}
19989
19991 ARM_MB::MemBOpt Domain) const {
// Emit a data memory barrier for the given domain, falling back to the
// CP15 barrier (mcr p15, 0, r0, c7, c10, 5) on ARMv6 cores without DMB.
19992 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
19993
19994 // First, if the target has no DMB, see what fallback we can use.
19995 if (!Subtarget->hasDataBarrier()) {
19996 // Some ARMv6 cpus can support data barriers with an mcr instruction.
19997 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
19998 // here.
19999 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
20000 Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
20001 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
20002 Builder.getInt32(0), Builder.getInt32(7),
20003 Builder.getInt32(10), Builder.getInt32(5)};
20004 return Builder.CreateCall(MCR, args);
20005 } else {
20006 // Instead of using barriers, atomic accesses on these subtargets use
20007 // libcalls.
20008 llvm_unreachable("makeDMB on a target so old that it has no barriers");
20009 }
20010 } else {
20011 Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
20012 // Only a full system barrier exists in the M-class architectures.
20013 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
20014 Constant *CDomain = Builder.getInt32(Domain);
20015 return Builder.CreateCall(DMB, CDomain);
20016 }
20017}
20018
20019// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
20021 Instruction *Inst,
20022 AtomicOrdering Ord) const {
// Emit the barrier required *before* an atomic operation with ordering Ord.
// NOTE(review): the case labels of this switch are elided in this view; the
// structure (per the C++11 mapping referenced above) is presumably:
// Unordered/NotAtomic -> unreachable; Monotonic/Acquire -> nothing;
// SequentiallyConsistent -> fence only for atomic stores; Release/AcqRel ->
// full barrier. Confirm against the full source.
20023 switch (Ord) {
20026 llvm_unreachable("Invalid fence: unordered/non-atomic");
20029 return nullptr; // Nothing to do
20031 if (!Inst->hasAtomicStore())
20032 return nullptr; // Nothing to do
20036 if (Subtarget->preferISHSTBarriers())
20037 return makeDMB(Builder, ARM_MB::ISHST);
20038 // FIXME: add a comment with a link to documentation justifying this.
20039 else
20040 return makeDMB(Builder, ARM_MB::ISH);
20041 }
20042 llvm_unreachable("Unknown fence ordering in emitLeadingFence");
20043}
20044
20046 Instruction *Inst,
20047 AtomicOrdering Ord) const {
// Emit the barrier required *after* an atomic operation with ordering Ord.
// NOTE(review): case labels are elided; presumably unordered/non-atomic are
// unreachable, monotonic/release need nothing, and acquire-or-stronger get a
// full DMB ISH — confirm against the full source.
20048 switch (Ord) {
20051 llvm_unreachable("Invalid fence: unordered/not-atomic");
20054 return nullptr; // Nothing to do
20058 return makeDMB(Builder, ARM_MB::ISH);
20059 }
20060 llvm_unreachable("Unknown fence ordering in emitTrailingFence");
20062
20063// Loads and stores less than 64-bits are already atomic; ones above that
20064// are doomed anyway, so defer to the default libcall and blame the OS when
20065// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
20066// anything for those.
// 64-bit atomic stores need an ldrexd/strexd loop, which only exists on
// non-M-class cores; everything smaller is naturally atomic.
20068 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
20069 return (Size == 64) && !Subtarget->isMClass();
20070}
20071
20072// Loads and stores less than 64-bits are already atomic; ones above that
20073// are doomed anyway, so defer to the default libcall and blame the OS when
20074// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
20075// anything for those.
20076// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
20077// guarantee, see DDI0406C ARM architecture reference manual,
20078// sections A8.8.72-74 LDRD)
// 64-bit atomic loads expand to a bare ldrexd (LLOnly) on non-M-class cores;
// the elided fallback presumably returns AtomicExpansionKind::None — confirm.
20081 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
20082 return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
20084}
20085
20086// For the real atomic operations, we have ldrex/strex up to 32 bits,
20087// and up to 64 bits on the non-M profiles
// Choose the expansion strategy for an atomicrmw. The elided return lines
// presumably yield AtomicExpansionKind::{None, CmpXChg, LLSC} respectively
// for the FP, -O0, and supported-size cases — confirm against full source.
20090 if (AI->isFloatingPointOperation())
20092
20093 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
20094 // implement atomicrmw without spilling. If the target address is also on the
20095 // stack and close enough to the spill slot, this can lead to a situation
20096 // where the monitor always gets cleared and the atomic operation can never
20097 // succeed. So at -O0 lower this operation to a CAS loop.
20098 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
20100
20101 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
20102 bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
20103 return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
20106}
20107
20108// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
20109// bits, and up to 64 bits on the non-M profiles.
// Choose the expansion strategy for a cmpxchg; the elided return lines
// presumably yield AtomicExpansionKind::LLSC when LL/SC is usable and
// AtomicExpansionKind::None otherwise — confirm against full source.
20112 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
20113 // implement cmpxchg without spilling. If the address being exchanged is also
20114 // on the stack and close enough to the spill slot, this can lead to a
20115 // situation where the monitor always gets cleared and the atomic operation
20116 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
20117 unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
20118 bool HasAtomicCmpXchg =
20119 !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
20120 if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg &&
20121 Size <= (Subtarget->isMClass() ? 32U : 64U))
20124}
20125
20127 const Instruction *I) const {
// Controlled by the InsertFencesForAtomic flag computed at subtarget setup.
20128 return InsertFencesForAtomic;
20129}
20130
20131// This has so far only been implemented for MachO.
20133 return Subtarget->isTargetMachO();
20134}
20135
// Declare the MSVC CRT stack-protector symbols (__security_cookie and
// __security_check_cookie) in the module; the elided line presumably defers
// to TargetLowering::insertSSPDeclarations for non-MSVC environments.
20137 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
20139
20140 // MSVC CRT has a global variable holding security cookie.
20141 M.getOrInsertGlobal("__security_cookie",
20142 Type::getInt8PtrTy(M.getContext()));
20143
20144 // MSVC CRT has a function to validate security cookie.
20145 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
20146 "__security_check_cookie", Type::getVoidTy(M.getContext()),
20147 Type::getInt8PtrTy(M.getContext()));
// Mark the cookie parameter InReg so it is passed in a register, matching
// the MSVC CRT's expected calling convention for this check.
20149 F->addAttribute(1, Attribute::AttrKind::InReg);
20150}
20151
20153 // MSVC CRT has a global variable holding security cookie.
20154 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
20155 return M.getGlobalVariable("__security_cookie");
// The elided line presumably falls back to TargetLowering::getSDagStackGuard.
20157}
20158
20160 // MSVC CRT has a function to validate security cookie.
20161 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
20162 return M.getFunction("__security_check_cookie");
// The elided line presumably falls back to the base-class implementation.
20164}
20165
20167 unsigned &Cost) const {
// Report whether a store of extractelement(VectorTy, Idx) can be folded
// into a single lane store (NEON VST1 lane), and its extra cost (0 here).
20168 // If we do not have NEON, vector types are not natively supported.
20169 if (!Subtarget->hasNEON())
20170 return false;
20171
20172 // Floating point values and vector values map to the same register file.
20173 // Therefore, although we could do a store extract of a vector type, this is
20174 // better to leave at float as we have more freedom in the addressing mode for
20175 // those.
20176 if (VectorTy->isFPOrFPVectorTy())
20177 return false;
20178
20179 // If the index is unknown at compile time, this is very expensive to lower
20180 // and it is not possible to combine the store with the extract.
20181 if (!isa<ConstantInt>(Idx))
20182 return false;
20183
20184 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
20185 unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedSize();
20186 // We can do a store + vector extract on any vector that fits perfectly in a D
20187 // or Q register.
20188 if (BitWidth == 64 || BitWidth == 128) {
20189 Cost = 0;
20190 return true;
20191 }
20192 return false;
20193}
20194
// CTTZ is cheap to speculate when RBIT/CLZ are available (ARMv6T2 and up).
20196 return Subtarget->hasV6T2Ops();
20197}
20198
// CTLZ maps directly to the CLZ instruction on ARMv6T2 and up.
20200 return Subtarget->hasV6T2Ops();
20201}
20202
// NOTE(review): the enclosing function's signature is elided in this view;
// from the body it presumably is shouldExpandShift, preferring expansion
// except under minsize (Windows always expands) — confirm against full source.
20204 return !Subtarget->hasMinSize() || Subtarget->isTargetWindows();
20205}
20206
20208 Value *Addr,
20209 AtomicOrdering Ord) const {
// Emit an LL (load-linked) for the LL/SC atomic expansion: ldrex/ldaex for
// <= 32-bit values, ldrexd/ldaexd (returning {i32, i32}) for 64-bit values.
20210 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
20211 bool IsAcquire = isAcquireOrStronger(Ord);
20212
20213 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
20214 // intrinsic must return {i32, i32} and we have to recombine them into a
20215 // single i64 here.
20216 if (ValueTy->getPrimitiveSizeInBits() == 64) {
20218 IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
20220
20221 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
20222 Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");
20223
20224 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
20225 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
// Big-endian targets return the halves swapped relative to the i64 layout.
20226 if (!Subtarget->isLittle())
20227 std::swap (Lo, Hi);
20228 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
20229 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
20230 return Builder.CreateOr(
20231 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
20232 }
20233
20234 Type *Tys[] = { Addr->getType() };
20235 Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
20237
// ldrex always produces i32; narrow the result back to the requested type.
20238 return Builder.CreateTruncOrBitCast(Builder.CreateCall(Ldrex, Addr), ValueTy);
20239}
20240
20242 IRBuilderBase &Builder) const {
// Release the exclusive monitor on the no-store path of a cmpxchg loop.
// CLREX only exists from ARMv7; older cores leave the monitor to time out.
20243 if (!Subtarget->hasV7Ops())
20244 return;
20245 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
20246 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
20247}
20248
20250 Value *Val, Value *Addr,
20251 AtomicOrdering Ord) const {
// Emit the SC (store-conditional) half of the LL/SC expansion: strex/stlex
// for <= 32-bit values, strexd/stlexd (taking two i32 halves) for 64-bit.
// The call's i32 result is 0 on success, 1 if the monitor was lost.
20252 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
20253 bool IsRelease = isReleaseOrStronger(Ord);
20254
20255 // Since the intrinsics must have legal type, the i64 intrinsics take two
20256 // parameters: "i32, i32". We must marshal Val into the appropriate form
20257 // before the call.
20258 if (Val->getType()->getPrimitiveSizeInBits() == 64) {
20260 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
20262 Type *Int32Ty = Type::getInt32Ty(M->getContext());
20263
20264 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
20265 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
// Match the register-pair order expected by strexd on big-endian targets.
20266 if (!Subtarget->isLittle())
20267 std::swap(Lo, Hi);
20268 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
20269 return Builder.CreateCall(Strex, {Lo, Hi, Addr});
20270 }
20271
20272 Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
20273 Type *Tys[] = { Addr->getType() };
20275
// strex takes its value operand as i32; widen/bitcast Val to fit.
20276 return Builder.CreateCall(
20277 Strex, {Builder.CreateZExtOrBitCast(
20278 Val, Strex->getFunctionType()->getParamType(0)),
20279 Addr});
20280}
20281
20282
// NOTE(review): the enclosing function's signature is elided in this view;
// from the body it presumably is alignLoopsWithOptSize (M-class cores still
// benefit from aligned loop headers under -Os) — confirm against full source.
20284 return Subtarget->isMClass();
20285}
20286
20287/// A helper function for determining the number of interleaved accesses we
20288/// will generate when lowering accesses of the given type.
20289unsigned
20291 const DataLayout &DL) const {
// Round the total vector size up to a multiple of 128 bits (one Q register)
// and count how many 128-bit accesses that requires.
20292 return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
20293}
20294
20296 unsigned Factor, FixedVectorType *VecTy, Align Alignment,
20297 const DataLayout &DL) const {
20298
20299 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
20300 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
20301
20302 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
20303 return false;
20304
20305 // Ensure the vector doesn't have f16 elements. Even though we could do an
20306 // i16 vldN, we can't hold the f16 vectors and will end up converting via
20307 // f32.
20308 if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
20309 return false;
20310 if (Subtarget->hasMVEIntegerOps() && Factor == 3)
20311 return false;
20312
20313 // Ensure the number of vector elements is greater than 1.
20314 if (VecTy->getNumElements() < 2)
20315 return false;
20316
20317 // Ensure the element type is legal.
20318 if (ElSize != 8 && ElSize != 16 && ElSize != 32)
20319 return false;
20320 // And the alignment if high enough under MVE.
20321 if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
20322 return false;
20323
20324 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
20325 // 128 will be split into multiple interleaved accesses.
20326 if (Subtarget->hasNEON() && VecSize == 64)
20327 return true;
20328 return VecSize % 128 == 0;
20329}
20330
// Maximum interleave factor: NEON supports vld2/3/4, MVE only vld2q/vld4q
// (the elided lines presumably return 4 for MVE and the base-class default
// otherwise — confirm against the full source).
20332 if (Subtarget->hasNEON())
20333 return 4;
20334 if (Subtarget->hasMVEIntegerOps())
20337}
20338
20339/// Lower an interleaved load into a vldN intrinsic.
20340///
20341/// E.g. Lower an interleaved load (Factor = 2):
20342/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
20343/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
20344/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
20345///
20346/// Into:
20347/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
20348/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
20349/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
20352 ArrayRef<unsigned> Indices, unsigned Factor) const {
20353 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
20354 "Invalid interleave factor");
20355 assert(!Shuffles.empty() && "Empty shufflevector input");
20356 assert(Shuffles.size() == Indices.size() &&
20357 "Unmatched number of shufflevectors and indices");
20358
20359 auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
20360 Type *EltTy = VecTy->getElementType();
20361
20362 const DataLayout &DL = LI->getModule()->getDataLayout();
20363 Align Alignment = LI->getAlign();
20364
20365 // Skip if we do not have NEON and skip illegal vector types. We can
20366 // "legalize" wide vector types into multiple interleaved accesses as long as
20367 // the vector types are divisible by 128.
20368 if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
20369 return false;
20370
20371 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
20372
20373 // A pointer vector can not be the return type of the ldN intrinsics. Need to
20374 // load integer vectors first and then convert to pointer vectors.
20375 if (EltTy->isPointerTy())
20376 VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);
20377
20378 IRBuilder<> Builder(LI);
20379
20380 // The base address of the load.
20381 Value *BaseAddr = LI->getPointerOperand();
20382
20383 if (NumLoads > 1) {
20384 // If we're going to generate more than one load, reset the sub-vector type
20385 // to something legal.
20386 VecTy = FixedVectorType::get(VecTy->getElementType(),
20387 VecTy->getNumElements() / NumLoads);
20388
20389 // We will compute the pointer operand of each load from the original base
20390 // address using GEPs. Cast the base address to a pointer to the scalar
20391 // element type.
20392 BaseAddr = Builder.CreateBitCast(
20393 BaseAddr,
20394 VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
20395 }
20396
20397 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
20398
// Emits one NEON vldN (with explicit alignment operand) or one MVE
// vld2q/vld4q call for the given base address.
20399 auto createLoadIntrinsic = [&](Value *BaseAddr) {
20400 if (Subtarget->hasNEON()) {
20401 Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
20402 Type *Tys[] = {VecTy, Int8Ptr};
20403 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
20404 Intrinsic::arm_neon_vld3,
20405 Intrinsic::arm_neon_vld4};
20407 Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
20408
20410 Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
20411 Ops.push_back(Builder.getInt32(LI->getAlignment()));
20412
20413 return Builder.CreateCall(VldnFunc, Ops, "vldN");
20414 } else {
20415 assert((Factor == 2 || Factor == 4) &&
20416 "expected interleave factor of 2 or 4 for MVE");
20418 Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
20419 Type *VecEltTy =
20420 VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace());
20421 Type *Tys[] = {VecTy, VecEltTy};
20424
20426 Ops.push_back(Builder.CreateBitCast(BaseAddr, VecEltTy));
20427 return Builder.CreateCall(VldnFunc, Ops, "vldN");
20428 }
20429 };
20430
20431 // Holds sub-vectors extracted from the load intrinsic return values. The
20432 // sub-vectors are associated with the shufflevector instructions they will
20433 // replace.
20435
20436 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
20437 // If we're generating more than one load, compute the base address of
20438 // subsequent loads as an offset from the previous.
20439 if (LoadCount > 0)
20440 BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
20441 VecTy->getNumElements() * Factor);
20442
20443 CallInst *VldN = createLoadIntrinsic(BaseAddr);
20444
20445 // Replace uses of each shufflevector with the corresponding vector loaded
20446 // by ldN.
20447 for (unsigned i = 0; i < Shuffles.size(); i++) {
20448 ShuffleVectorInst *SV = Shuffles[i];
20449 unsigned Index = Indices[i];
20450
20451 Value *SubVec = Builder.CreateExtractValue(VldN, Index);
20452
20453 // Convert the integer vector to pointer vector if the element is pointer.
20454 if (EltTy->isPointerTy())
20455 SubVec = Builder.CreateIntToPtr(
20456 SubVec,
20458
20459 SubVecs[SV].push_back(SubVec);
20460 }
20461 }
20462
20463 // Replace uses of the shufflevector instructions with the sub-vectors
20464 // returned by the load intrinsic. If a shufflevector instruction is
20465 // associated with more than one sub-vector, those sub-vectors will be
20466 // concatenated into a single wide vector.
20467 for (ShuffleVectorInst *SVI : Shuffles) {
20468 auto &SubVec = SubVecs[SVI];
20469 auto *WideVec =
20470 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
20471 SVI->replaceAllUsesWith(WideVec);
20472 }
20473
20474 return true;
20475}
20476
20477/// Lower an interleaved store into a vstN intrinsic.
20478///
20479/// E.g. Lower an interleaved store (Factor = 3):
20480/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
20481/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
20482/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
20483///
20484/// Into:
20485/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
20486/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
20487/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
20488/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
20489///
20490/// Note that the new shufflevectors will be removed and we'll only generate one
20491/// vst3 instruction in CodeGen.
20492///
20493/// Example for a more general valid mask (Factor 3). Lower:
20494/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
20495/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
20496/// store <12 x i32> %i.vec, <12 x i32>* %ptr
20497///
20498/// Into:
20499/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
20500/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
20501/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
20502/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
20504 ShuffleVectorInst *SVI,
20505 unsigned Factor) const {
20506 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
20507 "Invalid interleave factor");
20508
20509 auto *VecTy = cast<FixedVectorType>(SVI->getType());
20510 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
20511
20512 unsigned LaneLen = VecTy->getNumElements() / Factor;
20513 Type *EltTy = VecTy->getElementType();
20514 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
20515
20516 const DataLayout &DL = SI->getModule()->getDataLayout();
20517 Align Alignment = SI->getAlign();
20518
20519 // Skip if we do not have NEON and skip illegal vector types. We can
20520 // "legalize" wide vector types into multiple interleaved accesses as long as
20521 // the vector types are divisible by 128.
20522 if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
20523 return false;
20524
20525 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
20526
20527 Value *Op0 = SVI->getOperand(0);
20528 Value *Op1 = SVI->getOperand(1);
20530
20531 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
20532 // vectors to integer vectors.
20533 if (EltTy->isPointerTy()) {
20534 Type *IntTy = DL.getIntPtrType(EltTy);
20535
20536 // Convert to the corresponding integer vector.
20537 auto *IntVecTy =
20539 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
20540 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
20541
20543 }
20544
20545 // The base address of the store.
20546 Value *BaseAddr = SI->getPointerOperand();
20547
20548 if (NumStores > 1) {
20549 // If we're going to generate more than one store, reset the lane length
20550 // and sub-vector type to something legal.
20551 LaneLen /= NumStores;
20552 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
20553
20554 // We will compute the pointer operand of each store from the original base
20555 // address using GEPs. Cast the base address to a pointer to the scalar
20556 // element type.
20557 BaseAddr = Builder.CreateBitCast(
20558 BaseAddr,
20559 SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
20560 }
20561
20562 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
20563
20564 auto Mask = SVI->getShuffleMask();
20565
20566 auto createStoreIntrinsic = [&](Value *BaseAddr,
20567 SmallVectorImpl<Value *> &Shuffles) {
20568 if (Subtarget->hasNEON()) {
20569 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
20570 Intrinsic::arm_neon_vst3,
20571 Intrinsic::arm_neon_vst4};
20572 Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
20573 Type *Tys[] = {Int8Ptr, SubVecTy};
20574
20576 SI->getModule(), StoreInts[Factor - 2], Tys);
20577
20579 Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
20580 append_range(Ops, Shuffles);
20581 Ops.push_back(Builder.getInt32(SI->getAlignment()));
20582 Builder.CreateCall(VstNFunc, Ops);
20583 } else {
20584 assert((Factor == 2 || Factor == 4) &&
20585 "expected interleave factor of 2 or 4 for MVE");
20587 Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
20588 Type *EltPtrTy = SubVecTy->getElementType()->getPointerTo(
20589 SI->getPointerAddressSpace());
20590 Type *Tys[] = {EltPtrTy, SubVecTy};
20592 Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys);
20593
20595 Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy));
20596 append_range(Ops, Shuffles);
20597 for (unsigned F = 0; F < Factor; F++) {
20598 Ops.push_back(Builder.getInt32(F));
20599 Builder.CreateCall(VstNFunc, Ops);
20600 Ops.pop_back();
20601 }
20602 }
20603 };
20604
20605 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
20606 // If we generating more than one store, we compute the base address of
20607 // subsequent stores as an offset from the previous.
20608 if (StoreCount > 0)
20609 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
20610 BaseAddr, LaneLen * Factor);
20611
20612 SmallVector<Value *, 4> Shuffles;
20613
20614 // Split the shufflevector operands into sub vectors for the new vstN call.
20615 for (unsigned i = 0; i < Factor; i++) {
20616 unsigned IdxI = StoreCount * LaneLen * Factor + i;
20617 if (Mask[IdxI] >= 0) {
20618 Shuffles.push_back(Builder.CreateShuffleVector(
20619 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
20620 } else {
20621 unsigned StartMask = 0;
20622 for (unsigned j = 1; j < LaneLen; j++) {
20623 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
20624 if (Mask[IdxJ * Factor + IdxI] >= 0) {
20625 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
20626 break;
20627 }
20628 }
20629 // Note: If all elements in a chunk are undefs, StartMask=0!
20630 // Note: Filling undef gaps with random elements is ok, since
20631 // those elements were being written anyway (with undefs).
20632 // In the case of all undefs we're defaulting to using elems from 0
20633 // Note: StartMask cannot be negative, it's checked in
20634 // isReInterleaveMask
20635 Shuffles.push_back(Builder.CreateShuffleVector(
20636 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
20637 }
20638 }
20639
20640 createStoreIntrinsic(BaseAddr, Shuffles);
20641 }
20642 return true;
20643}
20644
20652
20654 uint64_t &Members) {
20655 if (auto *ST = dyn_cast<StructType>(Ty)) {
20656 for (unsigned i = 0; i < ST->getNumElements(); ++i) {
20657 uint64_t SubMembers = 0;
20658 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
20659 return false;
20660 Members += SubMembers;
20661 }
20662 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
20663 uint64_t SubMembers = 0;
20664 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
20665 return false;
20666 Members += SubMembers * AT->getNumElements();
20667 } else if (Ty->isFloatTy()) {
20668 if (Base != HA_UNKNOWN && Base != HA_FLOAT)
20669 return false;
20670 Members = 1;
20671 Base = HA_FLOAT;
20672 } else if (Ty->isDoubleTy()) {
20673 if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
20674 return false;
20675 Members = 1;
20676 Base = HA_DOUBLE;
20677 } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
20678 Members = 1;
20679 switch (Base) {
20680 case HA_FLOAT:
20681 case HA_DOUBLE:
20682 return false;
20683 case HA_VECT64:
20684 return VT->getPrimitiveSizeInBits().getFixedSize() == 64;
20685 case HA_VECT128:
20686 return VT->getPrimitiveSizeInBits().getFixedSize() == 128;
20687 case HA_UNKNOWN:
20688 switch (VT->getPrimitiveSizeInBits().getFixedSize()) {
20689 case 64:
20690 Base = HA_VECT64;
20691 return true;
20692 case 128:
20693 Base = HA_VECT128;
20694 return true;
20695 default:
20696 return false;
20697 }
20698 }
20699 }
20700
20701 return (Members > 0 && Members <= 4);
20702}
20703
20704/// Return the correct alignment for the current calling convention.
20706 Type *ArgTy, const DataLayout &DL) const {
20707 const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
20708 if (!ArgTy->isVectorTy())
20709 return ABITypeAlign;
20710
20711 // Avoid over-aligning vector parameters. It would require realigning the
20712 // stack and waste space for no real benefit.
20713 return std::min(ABITypeAlign, DL.getStackAlignment());
20714}
20715
20716/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
20717/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
20718/// passing according to AAPCS rules.
20720 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
20721 const DataLayout &DL) const {
20722 if (getEffectiveCallingConv(CallConv, isVarArg) !=
20724 return false;
20725
20727 uint64_t Members = 0;
20728 bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
20729 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
20730
20731 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
20732 return IsHA || IsIntArray;
20733}
20734
20736 const Constant *PersonalityFn) const {
20737 // Platforms which do not use SjLj EH may return values in these registers
20738 // via the personality function.
20739 return Subtarget->useSjLjEH() ? Register() : ARM::R0;
20740}
20741
20743 const Constant *PersonalityFn) const {
20744 // Platforms which do not use SjLj EH may return values in these registers
20745 // via the personality function.
20746 return Subtarget->useSjLjEH() ? Register() : ARM::R1;
20747}
20748
20749void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
20750 // Update IsSplitCSR in ARMFunctionInfo.
20751 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
20752 AFI->setIsSplitCSR(true);
20753}
20754
20755void ARMTargetLowering::insertCopiesSplitCSR(
20756 MachineBasicBlock *Entry,
20757 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
20758 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
20759 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
20760 if (!IStart)
20761 return;
20762
20763 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
20764 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
20766 for (const MCPhysReg *I = IStart; *I; ++I) {
20767 const TargetRegisterClass *RC = nullptr;
20768 if (ARM::GPRRegClass.contains(*I))
20769 RC = &ARM::GPRRegClass;
20770 else if (ARM::DPRRegClass.contains(*I))
20771 RC = &ARM::DPRRegClass;
20772 else
20773 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
20774
20775 Register NewVR = MRI->createVirtualRegister(RC);
20776 // Create copy from CSR to a virtual register.
20777 // FIXME: this currently does not emit CFI pseudo-instructions, it works
20778 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
20779 // nounwind. If we want to generalize this later, we may need to emit
20780 // CFI pseudo-instructions.
20781 assert(Entry->getParent()->getFunction().hasFnAttribute(
20782 Attribute::NoUnwind) &&
20783 "Function should be nounwind in insertCopiesSplitCSR!");
20784 Entry->addLiveIn(*I);
20785 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
20786 .addReg(*I);
20787
20788 // Insert the copy-back instructions right before the terminator.
20789 for (auto *Exit : Exits)
20790 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
20791 TII->get(TargetOpcode::COPY), *I)
20792 .addReg(NewVR);
20793 }
20794}
20795
unsigned const MachineRegisterInfo * MRI
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, bool isSigned)
static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG)
static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
#define MAKE_CASE(V)
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static bool isSignExtended(SDNode *N, SelectionDAG &DAG)
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static const unsigned PerfectShuffleTable[6561+1]
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static bool isConstant(const MachineInstr &MI)
amdgpu Simplify well known AMD library false FunctionCallee Callee
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG)
static bool isStore(int Opcode)
static bool isThumb(const MCSubtargetInfo &STI)
static SDValue PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, const TargetInstrInfo *TII)
MatchingStackOffset - Return true if the given stack call argument is already available in the same p...
static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, SelectionDAG &DAG)
static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue FlattenVectorShuffle(ShuffleVectorSDNode *N, SelectionDAG &DAG)
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total value size to 64 bits.
static cl::opt< unsigned > ConstpoolPromotionMaxSize("arm-promote-constant-max-size", cl::Hidden, cl::desc("Maximum size of constant to promote into a constant pool"), cl::init(64))
static bool isZeroOrAllOnes(SDValue N, bool AllOnes)
static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isVTBLMask(ArrayRef< int > M, EVT VT)
static SDValue PerformSUBCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
static cl::opt< bool > EnableConstpoolPromotion("arm-promote-constant", cl::Hidden, cl::desc("Enable / disable promotion of unnamed_addr constants into " "constant pools"), cl::init(false))
static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG)
static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformExtractEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static const APInt * isPowerOf2Constant(SDValue V)
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) can replace combinations of ...
static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static bool isValidMVECond(unsigned CC, bool IsFloat)
static SDValue PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC)
IntCCToARMCC - Convert a DAG integer condition code to an ARM CC.
static SDValue PerformSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSTORECombine - Target-specific dag combine xforms for ISD::STORE.
static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry, SelectionDAG &DAG)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isGTorGE(ISD::CondCode CC)
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1) intrinsic,...
static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask)
static bool isReverseMask(ArrayRef< int > M, EVT VT)
static bool isVZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of "vector_shuffle v,...
static SDValue PerformSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG)
static bool isVTRNMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool CanInvertMVEVCMP(SDValue N)
static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG)
static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformShiftCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
PerformShiftCombine - Checks for immediate versions of vector shifts and lowers them.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, ARMCC::CondCodes &CondCode2)
FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static EVT getVectorTyFromPredicateVector(EVT VT)
static SDValue PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static bool isSRL16(const SDValue &Op)
static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG)
static SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static Register genTPEntry(MachineBasicBlock *TpEntry, MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpExit, Register OpSizeReg, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI)
Adds logic in loop entry MBB to calculate loop iteration count and adds t2WhileLoopSetup and t2WhileL...
static bool isLTorLE(ISD::CondCode CC)
static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, SelectionDAG &DAG)
static SDValue PerformBITCASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG)
static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static bool hasNormalLoadOperand(SDNode *N)
hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node are normal,...
static SDValue PerformInsertEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
PerformInsertEltCombine - Target-specific dag combine xforms for ISD::INSERT_VECTOR_ELT.
static SDValue PerformVDUPLANECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVDUPLANECombine - Target-specific dag combine xforms for ARMISD::VDUPLANE.
static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static cl::opt< unsigned > ConstpoolPromotionMaxTotal("arm-promote-constant-max-total", cl::Hidden, cl::desc("Maximum size of ALL constants to promote into a constant pool"), cl::init(128))
static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static RTLIB::Libcall getDivRemLibcall(const SDNode *N, MVT::SimpleValueType SVT)
static SDValue PerformABSCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG)
SkipLoadExtensionForVMULL - return a load of the original vector size that does not do any sign/zero ...
static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static const MCPhysReg GPRArgRegs[]
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, SelectionDAG &DAG)
static bool isVZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformORCombineToSMULWBT(SDNode *OR, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isVTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of "vector_shuffle v,...
static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue FindBFIToCombineWith(SDNode *N)
static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT, SelectionDAG &DAG)
static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, bool &swpCmpOps, bool &swpVselOps)
static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isS16(const SDValue &Op, SelectionDAG &DAG)
static bool isSRA16(const SDValue &Op)
static SDValue AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue LowerInterruptReturn(SmallVectorImpl< SDValue > &RetOps, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, SelectionDAG &DAG)
static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, SDValue &RetVal1, SDValue &RetVal2)
static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isSHL16(const SDValue &Op)
static bool isVEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseVEXT, unsigned &Imm)
static SDValue PerformMVEVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2)
Return the load opcode for a given load size.
static bool isLegalT2AddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
static bool isLegalMVEShuffleOp(unsigned PFEntry)
static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N, SelectionDAG &DAG)
static bool isVUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG)
PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for ISD::VECTOR_SHUFFLE.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG)
SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, ANY_EXTEND,...
static bool isVMOVNTruncMask(ArrayRef< int > M, EVT ToVT, bool rev)
static SDValue PerformVQMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static MachineBasicBlock * OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ)
static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformAddcSubcCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static TargetLowering::ArgListTy getDivRemArgList(const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget)
static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static ARMCC::CondCodes getVCMPCondCode(SDValue N)
static cl::opt< bool > ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), cl::init(true))
static void ReplaceREADCYCLECOUNTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformORCombineToBFI(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, SDValue &CC, bool &Invert, SDValue &OtherOp, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isZeroVector(SDValue N)
static SDValue PerformAddeSubeCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void ReplaceCMP_SWAP_64Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, const SDValue TrueVal, const SDValue FalseVal, const ISD::CondCode CC, const SDValue K)
static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG)
static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned StSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment store operation with given size.
static bool isVMOVNMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, NEON load/store intrinsics,...
static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVRRDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMOVRRDCombine - Target-specific dag combine xforms for ARMISD::VMOVRRD.
static SDValue PerformVMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain)
static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue PerformVMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMULCombine Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the special multi...
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformORCombine - Target-specific dag combine xforms for ISD::OR.
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG)
static unsigned SelectPairHalf(unsigned Elements, ArrayRef< int > Mask, unsigned Index)
static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned LdSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment load operation with given size.
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, const ARMSubtarget *ST, const SDLoc &dl)
static SDValue PerformXORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, bool isSEXTLoad, bool IsMasked, bool isLE, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
std::pair< unsigned, const TargetRegisterClass * > RCPair
static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, TargetLowering::DAGCombinerInfo &DCI, bool AllOnes=false)
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, ISD::ZERO_EXTEND,...
static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2))
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, const SDLoc &dl, EVT &VT, EVT VectorVT, VMOVModImmType type)
isVMOVModifiedImm - Check if the specified splat value corresponds to a valid vector constant for a N...
static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, SelectionDAG &DAG)
BC is a bitcast that is about to be turned into a VMOVDRR.
static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, const GlobalValue *GV, SelectionDAG &DAG, EVT PtrVT, const SDLoc &dl)
static unsigned isNEONTwoResultShuffleMask(ArrayRef< int > ShuffleMask, EVT VT, unsigned &WhichResult, bool &isV_UNDEF)
Check if ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), and return the corresponding AR...
static bool BitsProperlyConcatenate(const APInt &A, const APInt &B)
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG)
static bool allUsersAreInFunction(const Value *V, const Function *F)
Return true if all users of V are within function F, looking through ConstantExprs.
static bool isSingletonVEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG)
PerformVMOVDRRCombine - Target-specific dag combine xforms for ARMISD::VMOVDRR.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, SDValue &SatK)
static bool isLegalAddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
isLegalAddressImmediate - Return true if the integer value can be used as the offset of the target ad...
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isLegalT1AddressImmediate(int64_t V, EVT VT)
static SDValue CombineANDShift(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformADDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDECombine - Target-specific dag combine transform from ARMISD::ADDC, ARMISD::ADDE,...
static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformHWLoopCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool isVUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of "vector_shuffle v,...
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, uint64_t &Members)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD) can replace combinations of ...
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, bool &Negate)
static bool canChangeToInt(SDValue Op, bool &SeenZero, const ARMSubtarget *Subtarget)
canChangeToInt - Given the fp compare operand, return true if it is suitable to morph to an integer c...
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2)
Return the store opcode for a given store size.
static bool IsVUZPShuffleNode(SDNode *N)
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, MachineInstr &MI, const SDNode *Node)
Attaches vregs to MEMCPY that it will use as scratch registers when it is expanded into LDM/STM.
static bool isFloatingPointZero(SDValue Op)
isFloatingPointZero - Return true if this is +0.0.
static SDValue findMUL_LOHI(SDValue V)
static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG)
static void genTPLoopBody(MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI, Register OpSrcReg, Register OpDestReg, Register ElementCountReg, Register TotalIterationsReg, bool IsMemcpy)
Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and t2DoLoopEnd.
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformMinMaxCombine - Target-specific DAG combining for creating truncating saturates.
This file provides a TargetTransformInfo::Concept conforming object specific to the ARM target machine.
Function Alias Analysis Results
assume Assume Builder
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
SmallVector< MachineOperand, 4 > Cond
BlockVerifier::State From
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< ShadowStackGC > C("shadow-stack", "Very portable GC for uncooperative code generators")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static Optional< bool > isBigEndian(const SmallDenseMap< int64_t, int64_t, 8 > &MemOffset2Idx, int64_t LowestIdx)
Given a map from byte offsets in memory to indices in a load/store, determine if that map corresponds...
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition Compiler.h:281
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx.
else return RetTy
#define LLVM_DEBUG(X)
Definition Debug.h:122
uint64_t Align
uint64_t Offset
uint64_t Addr
std::string Name
uint32_t Index
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static Function * getFunction(Constant *C)
#define op(i)
#define im(i)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
#define RegName(no)
lazy value info
loop Loop Strength Reduction
static M68kRelType getType(unsigned Kind, MCSymbolRefExpr::VariantKind &Modifier, bool &IsPCRel)
#define F(x, y, z)
Definition MD5.cpp:56
#define I(x, y, z)
Definition MD5.cpp:59
#define G(x, y, z)
Definition MD5.cpp:57
This file declares the MachineConstantPool class which is an abstract constant pool to keep track of ...
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
unsigned const TargetRegisterInfo * TRI
unsigned Reg
Promote Memory to Register
Definition Mem2Reg.cpp:110
Module.h This file contains the declarations for the Module class.
uint64_t High
IntegerType * Int32Ty
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
StandardInstrumentations SI(Debug, VerifyEach)
const char LLVMTargetMachineRef LLVMPassBuilderOptionsRef Options
const char LLVMTargetMachineRef TM
R600 Clause Merge
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
SI Lower i1 Copies
separate const offset from Split GEPs to a variadic base and a constant offset for better CSE
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:166
This file describes how to lower LLVM code to machine code.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:455
static X86::CondCode getSwappedCondition(X86::CondCode CC)
Assuming the flags are set by MI(a,b), return the condition code if we modify the instructions such t...
static constexpr int Concat[]
Class for arbitrary precision integers.
Definition APInt.h:70
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1631
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition APInt.h:1294
static APInt getAllOnesValue(unsigned numBits)
Get the all-ones value.
Definition APInt.h:567
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1205
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:578
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition APInt.h:487
bool isAllOnesValue() const
Determine if all bits are set.
Definition APInt.h:401
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:469
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Get a value with low bits set.
Definition APInt.h:667
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Get a value with high bits set.
Definition APInt.h:655
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:593
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1643
virtual const ARMBaseRegisterInfo & getRegisterInfo() const =0
const uint32_t * getSjLjDispatchPreservedMask(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
Code Generation virtual methods...
static ARMConstantPoolConstant * Create(const Constant *C, unsigned ID)
static ARMConstantPoolMBB * Create(LLVMContext &C, const MachineBasicBlock *mbb, unsigned ID, unsigned char PCAdj)
static ARMConstantPoolSymbol * Create(LLVMContext &C, StringRef s, unsigned ID, unsigned char PCAdj)
ARMConstantPoolValue - ARM specific constantpool value.
ARMFunctionInfo - This class is derived from MachineFunctionInfo and contains private ARM-specific in...
SmallPtrSet< const GlobalVariable *, 2 > & getGlobalsPromotedToConstantPool()
void setArgumentStackToRestore(unsigned v)
void setArgRegsSaveSize(unsigned s)
void setReturnRegsCount(unsigned s)
void markGlobalAsPromotedToConstantPool(const GlobalVariable *GV)
Indicate to the backend that GV has had its storage changed to inside a constant pool.
void setArgumentStackSize(unsigned size)
bool isTargetMachO() const
bool hasVMLxForwarding() const
bool hasFPAO() const
bool isThumb() const
bool hasRetAddrStack() const
bool hasNEON() const
bool isTargetAEABI() const
bool hasV6Ops() const
bool hasARMOps() const
bool supportsTailCall() const
const Triple & getTargetTriple() const
bool hasVFP4Base() const
const ARMBaseInstrInfo * getInstrInfo() const override
bool isThumb1Only() const
bool hasV5TOps() const
bool hasThumb2() const
bool useFPVFMx() const
bool hasFullFP16() const
bool hasFPARMv8Base() const
bool isThumb2() const
bool isTargetWindows() const
bool isGVIndirectSymbol(const GlobalValue *GV) const
True if the GV will be accessed via an indirect symbol.
bool hasBaseDSP() const
const ARMTargetLowering * getTargetLowering() const override
bool useSjLjEH() const
bool hasDivideInThumbMode() const
bool isTargetDarwin() const
const ARMBaseRegisterInfo * getRegisterInfo() const override
bool hasVFP2Base() const
bool isTargetAndroid() const
bool isTargetCOFF() const
bool isTargetGNUAEABI() const
bool hasV8_1MMainlineOps() const
bool hasVFP3Base() const
bool isAPCS_ABI() const
bool useFPVFMx64() const
bool isTargetWatchOS() const
bool preferISHSTBarriers() const
bool hasLOB() const
bool hasFP64() const
bool genLongCalls() const
bool hasMinSize() const
bool isFPBrccSlow() const
bool isTargetIOS() const
bool useNEONForSinglePrecisionFP() const
const InstrItineraryData * getInstrItineraryData() const override
getInstrItins - Return the instruction itineraries based on subtarget selection.
bool isTargetWatchABI() const
bool hasDSP() const
bool hasV7Ops() const
bool hasDataBarrier() const
bool hasAnyDataBarrier() const
bool isAAPCS_ABI() const
bool isLittle() const
bool allowsUnalignedMem() const
bool hasBF16() const
bool isTargetMuslAEABI() const
bool useSoftFloat() const
bool hasFPRegs16() const
bool hasMPExtension() const
bool hasMVEFloatOps() const
bool hasFPRegs() const
bool useFPVFMx16() const
bool isMClass() const
bool hasDivideInARMMode() const
bool hasV6T2Ops() const
unsigned getPrefLoopLogAlignment() const
bool hasV5TEOps() const
bool isTargetHardFloat() const
bool useMulOps() const
bool isTargetELF() const
bool hasV8MBaselineOps() const
bool useNaClTrap() const
bool hasMVEIntegerOps() const
bool hasFP16() const
bool hasPerfMon() const
bool hasAcquireRelease() const
bool genExecuteOnly() const
bool isReadOnly(const GlobalValue *GV) const
bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, unsigned &PrefAlign) const override
Return true if the pointer arguments to CI should be aligned by aligning the object whose address is ...
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getABIAlignmentForCallingConv(Type *ArgTy, const DataLayout &DL) const override
Return the correct alignment for the current calling convention.
void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
LowerAsmOperandForConstraint - Lower the specified operand into the Ops vector.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount though its operand,...
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
const ARMSubtarget * getSubtarget() const
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const
bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const
Returns true if the addressing mode representing by AM is legal for the Thumb1 target,...
bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPreIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mod...
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
ReplaceNodeResults - Replace the results of node with an illegal result type with new values built ou...
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override
Return true if SHIFT instructions should be expanded to SHIFT_PARTS instructions, and false if a libr...
bool isLegalAddImmediate(int64_t Imm) const override
isLegalAddImmediate - Return true if the specified immediate is legal add immediate,...
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns true if the given (atomic) store should be expanded by the IR-level AtomicExpand pass into an...
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
SDValue PerformMVETruncCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize=false) const override
isFPImmLegal - Returns true if the target can instruction select the specified FP immediate natively.
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) add (add x, 1), y The variant with two add's is IR...
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const
PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, bool *Fast) const override
allowsMisalignedMemoryAccesses - Returns true if the target allows unaligned memory accesses of the s...
bool ExpandInlineAsm(CallInst *CI) const override
This hook allows the target to expand an inline asm call to be explicit llvm code if it wants to.
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const
PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
Value * getSDagStackGuard(const Module &M) const override
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
SDValue PerformMVEExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
getSetCCResultType - Return the value type to use for ISD::SETCC.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
createFastISel - This method returns a target specific FastISel object, or null if the target does no...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
This method should be implemented by targets that mark instructions with the 'hasPostISelHook' flag.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isCheapToSpeculateCttz() const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
isShuffleMaskLegal - Targets can use this to indicate that they only support some VECTOR_SHUFFLE oper...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool useLoadStackGuardNode() const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const override
getRegClassFor - Return the register class that should be used for the specified value type.
std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override
Return the largest legal super-reg register class of the register class for the specified type and it...
bool isZExtFree(SDValue Val, EVT VT2) const override
Return true if zero-extending the specific node Val to type VT2 is free (either because it's implicit...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override
Lower an interleaved store into a vstN intrinsic.
InstructionCost getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override
getScalingFactorCost - Return the cost of the scaling used in addressing mode represented by AM.
bool isCheapToSpeculateCtlz() const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI)
SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const
PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
Type * shouldConvertSplatType(ShuffleVectorInst *SVI) const override
Given a shuffle vector SVI representing a vector splat, return a new scalar type of size equal to SVI...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
Instruction * makeDMB(IRBuilderBase &Builder, ARM_MB::MemBOpt Domain) const
bool isLegalICmpImmediate(int64_t Imm) const override
isLegalICmpImmediate - Return true if the specified immediate is legal icmp immediate,...
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override
Return true if it is profitable for dag combiner to transform a floating point op of specified opcode...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
bool isVectorLoadExtDesirable(SDValue ExtVal) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override
Return true if the target can combine store(extractelement VectorTy, Idx).
bool lowerInterleavedLoad(LoadInst *LI, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor) const override
Lower an interleaved load into a vldN intrinsic.
bool useSoftFloat() const override
bool alignLoopsWithOptSize() const override
Should loops be aligned even when the function is marked OptSize (but not MinSize).
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
Returns true if an argument of type Ty needs to be passed in a contiguous block of registers in calli...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPostIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mo...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
This class represents an incoming formal argument to a Function.
Definition Argument.h:29
const_pointer iterator
Definition ArrayRef.h:50
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
bool isFloatingPointOperation() const
bool hasFnAttribute(Attribute::AttrKind Kind) const
Equivalent to hasAttribute(AttributeList::FunctionIndex, Kind) but may be faster.
LLVM Basic Block Representation.
Definition BasicBlock.h:59
The address of a basic block.
Definition Constants.h:848
static BranchProbability getZero()
A "pseudo-class" with methods for operating on BUILD_VECTORs.
CCState - This class holds information needed while lowering arguments and return values.
void getInRegsParamInfo(unsigned InRegsParamRecordIndex, unsigned &BeginReg, unsigned &EndReg) const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
unsigned getNextStackOffset() const
getNextStackOffset - Return the next stack offset such that all stack slots satisfy their alignment r...
unsigned getInRegsParamsProcessed() const
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
unsigned getInRegsParamsCount() const
CCValAssign - Represent assignment of one arg/retval to a location.
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getCalledOperand() const
AttributeList getAttributes() const
Return the parameter attributes for this call.
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:691
const APFloat & getValueAPF() const
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:257
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:41
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:112
bool isLittleEndian() const
Layout endianness...
Definition DataLayout.h:241
bool isBigEndian() const
Definition DataLayout.h:242
Align getStackAlignment() const
Definition DataLayout.h:274
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition DataLayout.h:498
Align getPreferredAlign(const GlobalVariable *GV) const
Returns the preferred alignment of the specified global.
StringRef getPrivateGlobalPrefix() const
Definition DataLayout.h:330
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
A debug info location.
Definition DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:65
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:650
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
const Function & getFunction() const
Definition Function.h:136
arg_iterator arg_begin()
Definition Function.h:794
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:321
bool hasStructRetAttr() const
Determine if the function returns a structure through first or second pointer argument.
Definition Function.h:695
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.h:356
bool hasExternalWeakLinkage() const
bool hasDLLImportStorageClass() const
Module * getParent()
Get the module that this global value is contained inside of...
bool isStrongDefinitionForLinker() const
Returns true if this global's definition will be the one chosen by the linker.
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:55
unsigned isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
TargetInstrInfo overrides.
Common base class shared among various IRBuilders.
Definition IRBuilder.h:95
const std::string & getConstraintString() const
Definition InlineAsm.h:84
const std::string & getAsmString() const
Definition InlineAsm.h:83
int getOperandCycle(unsigned ItinClassIndx, unsigned OperandIdx) const
Return the cycle for the given class and operand.
bool isEmpty() const
Returns true if there are no itineraries.
bool hasAtomicStore() const
Return true if this atomic instruction stores to memory.
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
const BasicBlock * getParent() const
Definition Instruction.h:94
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Class to represent integer types.
static bool LowerToByteSwap(CallInst *CI)
Try to replace a call instruction with a call to a bswap intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
unsigned getAlignment() const
Return the alignment of the access that is being performed.
Align getAlign() const
Return the alignment of the access that is being performed.
This class is used to represent ISD::LOAD nodes.
Describe properties that are true of each instruction in the target description file.
unsigned getSchedClass() const
Return the scheduling class for this instruction.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
int getOperandConstraint(unsigned OpNum, MCOI::OperandConstraint Constraint) const
Returns the value of the specified operand constraint if it is present.
const MCOperandInfo * OpInfo
bool isOptionalDef() const
Set if this operand is an optional def.
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:41
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
bool isInteger() const
Return true if this is an integer or a vector integer type.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
bool isLiveIn(MCPhysReg Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
std::vector< MachineBasicBlock * >::iterator succ_iterator
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * getFallThrough()
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
void computeMaxCallFrameSize(const MachineFunction &MF)
Computes the maximum size of a call frame and the AdjustsStack property.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
Properties which a MachineFunction may have at a given point in time.
MachineFunctionProperties & reset(Property P)
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, uint64_t s, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *bb=nullptr)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
MachineJumpTableInfo * getOrCreateJumpTableInfo(unsigned JTEntryKind)
getOrCreateJumpTableInfo - Get the JumpTableInfo for this function; if it does not already exist,...
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
const MachineFunctionProperties & getProperties() const
Get the function properties.
bool hasCallSiteLandingPad(MCSymbol *Sym)
Return true if the landing pad Eh symbol has an associated call site.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
void insert(iterator MBBI, MachineBasicBlock *MBB)
SmallVectorImpl< unsigned > & getCallSiteLandingPad(MCSymbol *Sym)
Get the call site indexes for a landing pad EH symbol.
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addConstantPoolIndex(unsigned Idx, int Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI=nullptr) const
Return true if the MachineInstr fully defines the specified register.
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI=nullptr) const
Return true if the MachineInstr reads the specified register.
const MachineOperand & getOperand(unsigned i) const
unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
void setIsDef(bool Val=true)
Change a def to a use, or a use to a def.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class is used to represent an MLOAD node.
This class is used to represent an MSTORE node.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.cpp:401
virtual void print(raw_ostream &OS, const Module *M) const
print - Print out the internal state of the pass.
Definition Pass.cpp:125
Class to represent pointers.
Type * getElementType() const
Wrapper class representing virtual and physical registers.
Definition Register.h:19
static bool isVirtualRegister(unsigned Reg)
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:71
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
iterator_range< use_iterator > uses()
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Return true if this node is an UNDEF value.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
SDValue getStackArgumentTokenFactor(SDValue Chain)
Compute a TokenFactor to force all the incoming stack arguments to be loaded from the stack.
const TargetSubtargetInfo & getSubtarget() const
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=None, int Offset=0, unsigned TargetFlags=0)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, uint64_t Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
void addCallSiteInfo(const SDNode *CallNode, CallSiteInfoImpl &&CallInfo)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a logical NOT operation as (XOR Val, BooleanOne).
This instruction constructs a fixed permutation of two input vectors.
static bool isIdentityMask(ArrayRef< int > Mask)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
VectorType * getType() const
Overload to return most specific vector type.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
static bool isSplatMask(const int *Mask, EVT VT)
ArrayRef< int > getMask() const
typename SuperClass::iterator iterator
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:58
const unsigned char * bytes_end() const
Definition StringRef.h:135
LLVM_NODISCARD size_t size() const
size - Get the string size.
Definition StringRef.h:157
const unsigned char * bytes_begin() const
Definition StringRef.h:132
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:372
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setTargetDAGCombine(ISD::NodeType NT)
Targets should invoke this method for each target independent node that they want to provide a custom...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
void setCmpLibcallCC(RTLIB::Libcall Call, ISD::CondCode CC)
Override the default CondCode to be used to test the result of the comparison libcall against zero.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
void setIndexedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
void setMinStackArgumentAlignment(Align Alignment)
Set the minimum stack alignment of an argument.
const TargetMachine & getTargetMachine() const
void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC)
Set the CallingConv that should be used for the specified libcall.
void setIndexedMaskedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked load does or does not work with the specified type and ind...
virtual Value * getSDagStackGuard(const Module &M) const
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
Sched::Preference getSchedulingPreference() const
Return target scheduling preference.
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL, bool LegalTypes=true) const
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
virtual unsigned getMaxSupportedInterleaveFactor() const
Get the maximum supported factor for interleaved memory accesses.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setIndexedMaskedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked store does or does not work with the specified type and in...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
virtual std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const
Return the largest legal super-reg register class of the register class for the specified type and it...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
void setIndexedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
SDValue buildLegalVectorShuffle(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, MutableArrayRef< int > Mask, SelectionDAG &DAG) const
Tries to build a legal vector shuffle using the provided parameters or equivalent variations.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const
bool expandABS(SDNode *N, SDValue &Result, SelectionDAG &DAG, bool IsNegative=false) const
Expand ABS nodes.
bool isConstTrueVal(const SDNode *N) const
Return if the N is a constant or constant vector equal to the true value from getBooleanContents().
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
TargetOptions Options
unsigned EnableFastISel
EnableFastISel - This flag enables fast-path instruction selection which trades away generated code q...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:45
ObjectFormatType getObjectFormat() const
getFormat - Get the object format for this triple.
Definition Triple.h:337
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition Triple.h:584
bool isOSVersionLT(unsigned Major, unsigned Minor=0, unsigned Micro=0) const
isOSVersionLT - Helper function for doing comparisons against version numbers included in the target ...
Definition Triple.h:425
bool isWindowsMSVCEnvironment() const
Checks if the environment could be MSVC.
Definition Triple.h:557
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:83
ScalarTy getFixedSize() const
Definition TypeSize.h:426
static TypeSize Fixed(ScalarTy MinVal)
Definition TypeSize.h:423
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:204
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:237
static IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:203
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:228
static Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:186
static IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:201
static PointerType * getInt8PtrTy(LLVMContext &C, unsigned AS=0)
Definition Type.cpp:255
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:128
static IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:202
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:141
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:127
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:153
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:189
A Use represents the edge between a Value definition and its users.
Definition Use.h:44
const Use & getOperandUse(unsigned i) const
Definition User.h:182
Value * getOperand(unsigned i) const
Definition User.h:169
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
use_iterator use_begin()
Definition Value.h:361
Base class of all SIMD vector types.
Type * getElementType() const
Implementation for an ilist node.
Definition ilist_node.h:39
self_iterator getIterator()
Definition ilist_node.h:81
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
static CondCodes getOppositeCondition(CondCodes CC)
Definition ARMBaseInfo.h:48
@ SECREL
Section Relative (Windows TLS).
@ SBREL
Static Base Relative.
@ GOTTPOFF
Global Offset Table, Thread Pointer Offset.
@ TPOFF
Thread Pointer Offset.
TOF
Target Operand Flag enum.
@ MO_NONLAZY
MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it represents a symbol which,...
@ MO_SBREL
MO_SBREL - On a symbol operand, this represents a static base relative relocation.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_GOT
MO_GOT - On a symbol operand, this represents a GOT relative relocation.
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static ShiftOpc getShiftOpcForNode(unsigned Opcode)
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits)
decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the element value and the element ...
unsigned getAM2Offset(unsigned AM2Opc)
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
unsigned createVMOVModImm(unsigned OpCmode, unsigned Val)
int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm)
int getFP32FP16Imm(const APInt &Imm)
If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding for it.
AddrOpc getAM2Op(unsigned AM2Opc)
bool isBitFieldInvertedMask(unsigned v)
const unsigned RoundingBitsPos
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition BitmaskEnum.h:80
@ ARM_APCS
ARM_APCS - ARM Procedure Calling Standard calling convention (obsolete, but still used on some target...
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition CallingConv.h:87
@ ARM_AAPCS
ARM_AAPCS - ARM Architecture Procedure Calling Standard calling convention (aka EABI).
@ Fast
Fast - This calling convention attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:42
@ Tail
Tail - This calling convention attempts to make calls as fast as possible while guaranteeing that tail...
Definition CallingConv.h:81
@ SwiftTail
SwiftTail - This follows the Swift calling convention in how arguments are passed but guarantees tail...
Definition CallingConv.h:92
@ ARM_AAPCS_VFP
ARM_AAPCS_VFP - Same as ARM_AAPCS, but uses hard floating point ABI.
@ C
C - The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:702
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:236
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:675
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:462
@ FLT_ROUNDS_
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:825
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:147
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:480
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:250
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:666
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:239
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition ISDOpcodes.h:921
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:732
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:466
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:199
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:151
@ GlobalAddress
Definition ISDOpcodes.h:78
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:739
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:519
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:377
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:640
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:255
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition ISDOpcodes.h:858
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:848
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:229
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:79
@ SET_ROUNDING
Set rounding mode.
Definition ISDOpcodes.h:830
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:726
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:583
@ BR
Control flow instructions. These all have token chains.
Definition ISDOpcodes.h:937
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:674
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ ADDCARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:290
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition ISDOpcodes.h:915
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:710
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition ISDOpcodes.h:866
@ BR_CC
BR_CC - Conditional branch.
Definition ISDOpcodes.h:963
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:314
@ BR_JT
BR_JT - Jumptable branch.
Definition ISDOpcodes.h:946
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:336
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:679
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:211
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:222
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:208
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:310
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:614
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:657
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:563
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:549
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition ISDOpcodes.h:118
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:511
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:729
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:694
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition ISDOpcodes.h:898
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:318
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition ISDOpcodes.h:931
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:747
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:626
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:833
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:688
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:94
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:429
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:451
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:428
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition ISDOpcodes.h:911
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:785
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:456
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:632
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:184
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:500
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:814
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:141
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:735
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
@ BRCOND
BRCOND - Conditional branch.
Definition ISDOpcodes.h:956
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:715
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:476
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:327
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:491
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isZEXTLoad(const SDNode *N)
Returns true if the specified node is a ZEXTLOAD.
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
bool isSEXTLoad(const SDNode *N)
Returns true if the specified node is a SEXTLOAD.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Flag
These should be considered private to the implementation of the MCInstrDesc class.
bool match(Val *V, const Pattern &P)
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
match_combine_or< CastClass_match< OpTy, Instruction::ZExt >, CastClass_match< OpTy, Instruction::SExt > > m_ZExtOrSExt(const OpTy &Op)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
auto m_Undef()
Match an arbitrary undef constant.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPEXT(EVT OpVT, EVT RetVT)
getFPEXT - Return the FPEXT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:55
initializer< Ty > init(const Ty &Val)
constexpr double e
Definition MathExtras.h:57
----------------------------- PointerInfo -------------------------------
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1554
constexpr bool isUInt< 8 >(uint64_t x)
Definition MathExtras.h:405
bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool RetCC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1534
bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns true if Val1 has a lower Constant Materialization Cost than Val2.
bool operator==(uint64_t V1, const APInt &V2)
Definition APInt.h:2030
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:455
bool isStrongerThanMonotonic(AtomicOrdering AO)
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:467
bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
void append_range(Container &C, Range &&R)
Wrapper function to append a range to a container.
Definition STLExtras.h:1731
bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:496
void shuffle(Iterator first, Iterator last, RNG &&g)
Definition STLExtras.h:1361
static std::array< MachineOperand, 2 > predOps(ARMCC::CondCodes Pred, unsigned PredReg=0)
Get the operands corresponding to the given Pred value.
bool CC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isShiftedMask_32(uint32_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (32 bit ver...
Definition MathExtras.h:479
bool CC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isReleaseOrStronger(AtomicOrdering AO)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1541
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:596
bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
bool RetCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
unsigned countLeadingZeros(T Val, ZeroBehavior ZB=ZB_Width)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition MathExtras.h:225
bool CC_ARM_Win32_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:491
constexpr size_t array_lengthof(T(&)[N])
Find the length of an array.
Definition STLExtras.h:1377
unsigned countTrailingZeros(T Val, ZeroBehavior ZB=ZB_Width)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition MathExtras.h:156
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:163
ArrayRef< T > makeArrayRef(const T &OneElt)
Construct an ArrayRef from a single element.
Definition ArrayRef.h:476
LLVM_ATTRIBUTE_NORETURN void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition Error.cpp:140
unsigned countTrailingOnes(T Value, ZeroBehavior ZB=ZB_Width)
Count the number of ones from the least significant bit to the first zero bit.
Definition MathExtras.h:525
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
detail::enumerator< R > enumerate(R &&TheRange)
Given an input range, returns a new range whose values are pairs (A, B) such that A is the 0-based ...
Definition STLExtras.h:1964
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition DAGCombine.h:15
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:460
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:148
bool RetFastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr uint64_t MinAlign(uint64_t A, uint64_t B)
A and B are either alignments or offsets.
Definition MathExtras.h:672
std::enable_if_t<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type > cast(const Y &Val)
Definition Casting.h:254
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
bool isAcquireOrStronger(AtomicOrdering AO)
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr unsigned BitWidth
static MachineOperand t1CondCodeOp(bool isDead=false)
Get the operand corresponding to the conditional code result for Thumb1.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1624
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1561
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
static MachineOperand condCodeOp(unsigned CCReg=0)
Get the operand corresponding to the conditional code result.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
unsigned gettBLXrOpcode(const MachineFunction &MF)
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
unsigned convertAddSubFlagsOpcode(unsigned OldOpc)
Map pseudo instructions that imply an 'S' bit onto real opcodes.
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:840
#define N
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:651
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:363
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:130
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:257
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:140
ElementCount getVectorElementCount() const
Definition ValueTypes.h:323
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:341
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:353
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:432
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:289
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:186
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:349
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:155
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:296
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:301
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:150
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:309
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:415
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:145
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:181
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:284
static KnownBits commonBits(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits common to LHS and RHS.
Definition KnownBits.h:289
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:63
static KnownBits mul(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from multiplying LHS and RHS.
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:40
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:161
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:66
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:169
static KnownBits computeForAddSub(bool Add, bool NSW, const KnownBits &LHS, KnownBits RHS)
Compute known bits resulting from adding LHS and RHS.
Definition KnownBits.cpp:57
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getJumpTable(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a jump table entry.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
static MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:109
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg If BaseGV is null...
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setInRegister(bool Value=true)
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
SmallVector< ISD::InputArg, 32 > Ins
CallLoweringInfo & setCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDiscardResult(bool Value=true)
CallLoweringInfo & setZExtResult(bool Value=true)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setSExtResult(bool Value=true)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...